mirror of
https://github.com/lkl/linux.git
synced 2025-12-19 16:13:19 +09:00
Merge drm/drm-next into drm-misc-next
Let's kick-off this release cycle. Signed-off-by: Maxime Ripard <maxime@cerno.tech>
This commit is contained in:
@@ -1,2 +1,4 @@
|
|||||||
|
Alan Cox <alan@lxorguk.ukuu.org.uk>
|
||||||
|
Alan Cox <root@hraefn.swansea.linux.org.uk>
|
||||||
Christoph Hellwig <hch@lst.de>
|
Christoph Hellwig <hch@lst.de>
|
||||||
Marc Gonzalez <marc.w.gonzalez@free.fr>
|
Marc Gonzalez <marc.w.gonzalez@free.fr>
|
||||||
|
|||||||
6
.gitignore
vendored
6
.gitignore
vendored
@@ -37,6 +37,8 @@
|
|||||||
*.o
|
*.o
|
||||||
*.o.*
|
*.o.*
|
||||||
*.patch
|
*.patch
|
||||||
|
*.rmeta
|
||||||
|
*.rsi
|
||||||
*.s
|
*.s
|
||||||
*.so
|
*.so
|
||||||
*.so.dbg
|
*.so.dbg
|
||||||
@@ -97,6 +99,7 @@ modules.order
|
|||||||
!.gitattributes
|
!.gitattributes
|
||||||
!.gitignore
|
!.gitignore
|
||||||
!.mailmap
|
!.mailmap
|
||||||
|
!.rustfmt.toml
|
||||||
|
|
||||||
#
|
#
|
||||||
# Generated include files
|
# Generated include files
|
||||||
@@ -162,3 +165,6 @@ x509.genkey
|
|||||||
|
|
||||||
# Documentation toolchain
|
# Documentation toolchain
|
||||||
sphinx_*/
|
sphinx_*/
|
||||||
|
|
||||||
|
# Rust analyzer configuration
|
||||||
|
/rust-project.json
|
||||||
|
|||||||
12
.mailmap
12
.mailmap
@@ -71,6 +71,9 @@ Ben M Cahill <ben.m.cahill@intel.com>
|
|||||||
Ben Widawsky <bwidawsk@kernel.org> <ben@bwidawsk.net>
|
Ben Widawsky <bwidawsk@kernel.org> <ben@bwidawsk.net>
|
||||||
Ben Widawsky <bwidawsk@kernel.org> <ben.widawsky@intel.com>
|
Ben Widawsky <bwidawsk@kernel.org> <ben.widawsky@intel.com>
|
||||||
Ben Widawsky <bwidawsk@kernel.org> <benjamin.widawsky@intel.com>
|
Ben Widawsky <bwidawsk@kernel.org> <benjamin.widawsky@intel.com>
|
||||||
|
Bjorn Andersson <andersson@kernel.org> <bjorn@kryo.se>
|
||||||
|
Bjorn Andersson <andersson@kernel.org> <bjorn.andersson@linaro.org>
|
||||||
|
Bjorn Andersson <andersson@kernel.org> <bjorn.andersson@sonymobile.com>
|
||||||
Björn Steinbrink <B.Steinbrink@gmx.de>
|
Björn Steinbrink <B.Steinbrink@gmx.de>
|
||||||
Björn Töpel <bjorn@kernel.org> <bjorn.topel@gmail.com>
|
Björn Töpel <bjorn@kernel.org> <bjorn.topel@gmail.com>
|
||||||
Björn Töpel <bjorn@kernel.org> <bjorn.topel@intel.com>
|
Björn Töpel <bjorn@kernel.org> <bjorn.topel@intel.com>
|
||||||
@@ -98,8 +101,7 @@ Christian Brauner <brauner@kernel.org> <christian.brauner@ubuntu.com>
|
|||||||
Christian Marangi <ansuelsmth@gmail.com>
|
Christian Marangi <ansuelsmth@gmail.com>
|
||||||
Christophe Ricard <christophe.ricard@gmail.com>
|
Christophe Ricard <christophe.ricard@gmail.com>
|
||||||
Christoph Hellwig <hch@lst.de>
|
Christoph Hellwig <hch@lst.de>
|
||||||
Colin Ian King <colin.king@intel.com> <colin.king@canonical.com>
|
Colin Ian King <colin.i.king@gmail.com> <colin.king@canonical.com>
|
||||||
Colin Ian King <colin.king@intel.com> <colin.i.king@gmail.com>
|
|
||||||
Corey Minyard <minyard@acm.org>
|
Corey Minyard <minyard@acm.org>
|
||||||
Damian Hobson-Garcia <dhobsong@igel.co.jp>
|
Damian Hobson-Garcia <dhobsong@igel.co.jp>
|
||||||
Daniel Borkmann <daniel@iogearbox.net> <danborkmann@googlemail.com>
|
Daniel Borkmann <daniel@iogearbox.net> <danborkmann@googlemail.com>
|
||||||
@@ -135,6 +137,7 @@ Filipe Lautert <filipe@icewall.org>
|
|||||||
Finn Thain <fthain@linux-m68k.org> <fthain@telegraphics.com.au>
|
Finn Thain <fthain@linux-m68k.org> <fthain@telegraphics.com.au>
|
||||||
Franck Bui-Huu <vagabon.xyz@gmail.com>
|
Franck Bui-Huu <vagabon.xyz@gmail.com>
|
||||||
Frank Rowand <frowand.list@gmail.com> <frank.rowand@am.sony.com>
|
Frank Rowand <frowand.list@gmail.com> <frank.rowand@am.sony.com>
|
||||||
|
Frank Rowand <frowand.list@gmail.com> <frank.rowand@sony.com>
|
||||||
Frank Rowand <frowand.list@gmail.com> <frank.rowand@sonymobile.com>
|
Frank Rowand <frowand.list@gmail.com> <frank.rowand@sonymobile.com>
|
||||||
Frank Rowand <frowand.list@gmail.com> <frowand@mvista.com>
|
Frank Rowand <frowand.list@gmail.com> <frowand@mvista.com>
|
||||||
Frank Zago <fzago@systemfabricworks.com>
|
Frank Zago <fzago@systemfabricworks.com>
|
||||||
@@ -150,6 +153,8 @@ Greg Kroah-Hartman <gregkh@suse.de>
|
|||||||
Greg Kroah-Hartman <greg@kroah.com>
|
Greg Kroah-Hartman <greg@kroah.com>
|
||||||
Greg Kurz <groug@kaod.org> <gkurz@linux.vnet.ibm.com>
|
Greg Kurz <groug@kaod.org> <gkurz@linux.vnet.ibm.com>
|
||||||
Gregory CLEMENT <gregory.clement@bootlin.com> <gregory.clement@free-electrons.com>
|
Gregory CLEMENT <gregory.clement@bootlin.com> <gregory.clement@free-electrons.com>
|
||||||
|
Guilherme G. Piccoli <kernel@gpiccoli.net> <gpiccoli@linux.vnet.ibm.com>
|
||||||
|
Guilherme G. Piccoli <kernel@gpiccoli.net> <gpiccoli@canonical.com>
|
||||||
Guo Ren <guoren@kernel.org> <guoren@linux.alibaba.com>
|
Guo Ren <guoren@kernel.org> <guoren@linux.alibaba.com>
|
||||||
Guo Ren <guoren@kernel.org> <ren_guo@c-sky.com>
|
Guo Ren <guoren@kernel.org> <ren_guo@c-sky.com>
|
||||||
Gustavo Padovan <gustavo@las.ic.unicamp.br>
|
Gustavo Padovan <gustavo@las.ic.unicamp.br>
|
||||||
@@ -253,6 +258,7 @@ Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@web.de>
|
|||||||
Li Yang <leoyang.li@nxp.com> <leoli@freescale.com>
|
Li Yang <leoyang.li@nxp.com> <leoli@freescale.com>
|
||||||
Li Yang <leoyang.li@nxp.com> <leo@zh-kernel.org>
|
Li Yang <leoyang.li@nxp.com> <leo@zh-kernel.org>
|
||||||
Lorenzo Pieralisi <lpieralisi@kernel.org> <lorenzo.pieralisi@arm.com>
|
Lorenzo Pieralisi <lpieralisi@kernel.org> <lorenzo.pieralisi@arm.com>
|
||||||
|
Luca Ceresoli <luca.ceresoli@bootlin.com> <luca@lucaceresoli.net>
|
||||||
Lukasz Luba <lukasz.luba@arm.com> <l.luba@partner.samsung.com>
|
Lukasz Luba <lukasz.luba@arm.com> <l.luba@partner.samsung.com>
|
||||||
Maciej W. Rozycki <macro@mips.com> <macro@imgtec.com>
|
Maciej W. Rozycki <macro@mips.com> <macro@imgtec.com>
|
||||||
Maciej W. Rozycki <macro@orcam.me.uk> <macro@linux-mips.org>
|
Maciej W. Rozycki <macro@orcam.me.uk> <macro@linux-mips.org>
|
||||||
@@ -313,6 +319,7 @@ Morten Welinder <welinder@troll.com>
|
|||||||
Mythri P K <mythripk@ti.com>
|
Mythri P K <mythripk@ti.com>
|
||||||
Nadia Yvette Chambers <nyc@holomorphy.com> William Lee Irwin III <wli@holomorphy.com>
|
Nadia Yvette Chambers <nyc@holomorphy.com> William Lee Irwin III <wli@holomorphy.com>
|
||||||
Nathan Chancellor <nathan@kernel.org> <natechancellor@gmail.com>
|
Nathan Chancellor <nathan@kernel.org> <natechancellor@gmail.com>
|
||||||
|
Neil Armstrong <neil.armstrong@linaro.org> <narmstrong@baylibre.com>
|
||||||
Nguyen Anh Quynh <aquynh@gmail.com>
|
Nguyen Anh Quynh <aquynh@gmail.com>
|
||||||
Nicholas Piggin <npiggin@gmail.com> <npiggen@suse.de>
|
Nicholas Piggin <npiggin@gmail.com> <npiggen@suse.de>
|
||||||
Nicholas Piggin <npiggin@gmail.com> <npiggin@kernel.dk>
|
Nicholas Piggin <npiggin@gmail.com> <npiggin@kernel.dk>
|
||||||
@@ -330,6 +337,7 @@ Oleksij Rempel <linux@rempel-privat.de> <external.Oleksij.Rempel@de.bosch.com>
|
|||||||
Oleksij Rempel <linux@rempel-privat.de> <fixed-term.Oleksij.Rempel@de.bosch.com>
|
Oleksij Rempel <linux@rempel-privat.de> <fixed-term.Oleksij.Rempel@de.bosch.com>
|
||||||
Oleksij Rempel <linux@rempel-privat.de> <o.rempel@pengutronix.de>
|
Oleksij Rempel <linux@rempel-privat.de> <o.rempel@pengutronix.de>
|
||||||
Oleksij Rempel <linux@rempel-privat.de> <ore@pengutronix.de>
|
Oleksij Rempel <linux@rempel-privat.de> <ore@pengutronix.de>
|
||||||
|
Oliver Upton <oliver.upton@linux.dev> <oupton@google.com>
|
||||||
Pali Rohár <pali@kernel.org> <pali.rohar@gmail.com>
|
Pali Rohár <pali@kernel.org> <pali.rohar@gmail.com>
|
||||||
Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
|
Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
|
||||||
Patrick Mochel <mochel@digitalimplant.org>
|
Patrick Mochel <mochel@digitalimplant.org>
|
||||||
|
|||||||
12
.rustfmt.toml
Normal file
12
.rustfmt.toml
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
edition = "2021"
|
||||||
|
newline_style = "Unix"
|
||||||
|
|
||||||
|
# Unstable options that help catching some mistakes in formatting and that we may want to enable
|
||||||
|
# when they become stable.
|
||||||
|
#
|
||||||
|
# They are kept here since they are useful to run from time to time.
|
||||||
|
#format_code_in_doc_comments = true
|
||||||
|
#reorder_impl_items = true
|
||||||
|
#comment_width = 100
|
||||||
|
#wrap_comments = true
|
||||||
|
#normalize_comments = true
|
||||||
@@ -227,6 +227,17 @@ Contact: dmaengine@vger.kernel.org
|
|||||||
Description: Indicate the number of retires for an enqcmds submission on a sharedwq.
|
Description: Indicate the number of retires for an enqcmds submission on a sharedwq.
|
||||||
A max value to set attribute is capped at 64.
|
A max value to set attribute is capped at 64.
|
||||||
|
|
||||||
|
What: /sys/bus/dsa/devices/wq<m>.<n>/op_config
|
||||||
|
Date: Sept 14, 2022
|
||||||
|
KernelVersion: 6.0.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: Shows the operation capability bits displayed in bitmap format
|
||||||
|
presented by %*pb printk() output format specifier.
|
||||||
|
The attribute can be configured when the WQ is disabled in
|
||||||
|
order to configure the WQ to accept specific bits that
|
||||||
|
correlates to the operations allowed. It's visible only
|
||||||
|
on platforms that support the capability.
|
||||||
|
|
||||||
What: /sys/bus/dsa/devices/engine<m>.<n>/group_id
|
What: /sys/bus/dsa/devices/engine<m>.<n>/group_id
|
||||||
Date: Oct 25, 2019
|
Date: Oct 25, 2019
|
||||||
KernelVersion: 5.6.0
|
KernelVersion: 5.6.0
|
||||||
@@ -255,3 +266,27 @@ Contact: dmaengine@vger.kernel.org
|
|||||||
Description: Indicates the number of Read Buffers reserved for the use of
|
Description: Indicates the number of Read Buffers reserved for the use of
|
||||||
engines in the group. See DSA spec v1.2 9.2.18 GRPCFG Read Buffers
|
engines in the group. See DSA spec v1.2 9.2.18 GRPCFG Read Buffers
|
||||||
Reserved.
|
Reserved.
|
||||||
|
|
||||||
|
What: /sys/bus/dsa/devices/group<m>.<n>/desc_progress_limit
|
||||||
|
Date: Sept 14, 2022
|
||||||
|
KernelVersion: 6.0.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: Allows control of the number of work descriptors that can be
|
||||||
|
concurrently processed by an engine in the group as a fraction
|
||||||
|
of the Maximum Work Descriptors in Progress value specified in
|
||||||
|
the ENGCAP register. The acceptable values are 0 (default),
|
||||||
|
1 (1/2 of max value), 2 (1/4 of the max value), and 3 (1/8 of
|
||||||
|
the max value). It's visible only on platforms that support
|
||||||
|
the capability.
|
||||||
|
|
||||||
|
What: /sys/bus/dsa/devices/group<m>.<n>/batch_progress_limit
|
||||||
|
Date: Sept 14, 2022
|
||||||
|
KernelVersion: 6.0.0
|
||||||
|
Contact: dmaengine@vger.kernel.org
|
||||||
|
Description: Allows control of the number of batch descriptors that can be
|
||||||
|
concurrently processed by an engine in the group as a fraction
|
||||||
|
of the Maximum Batch Descriptors in Progress value specified in
|
||||||
|
the ENGCAP register. The acceptable values are 0 (default),
|
||||||
|
1 (1/2 of max value), 2 (1/4 of the max value), and 3 (1/8 of
|
||||||
|
the max value). It's visible only on platforms that support
|
||||||
|
the capability.
|
||||||
|
|||||||
@@ -54,3 +54,25 @@ Description:
|
|||||||
this feature.
|
this feature.
|
||||||
|
|
||||||
Output will be in the format: "0x%08x\n".
|
Output will be in the format: "0x%08x\n".
|
||||||
|
|
||||||
|
What: /sys/kernel/debug/<cros-ec-device>/suspend_timeout_ms
|
||||||
|
Date: August 2022
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Description:
|
||||||
|
Some ECs have a feature where they will track transitions of
|
||||||
|
a hardware-controlled sleep line, such as Intel's SLP_S0 line,
|
||||||
|
in order to detect cases where a system failed to go into deep
|
||||||
|
sleep states. The suspend_timeout_ms file controls the amount of
|
||||||
|
time in milliseconds the EC will wait before declaring a sleep
|
||||||
|
timeout event and attempting to wake the system.
|
||||||
|
|
||||||
|
Supply 0 to use the default value coded into EC firmware. Supply
|
||||||
|
65535 (EC_HOST_SLEEP_TIMEOUT_INFINITE) to disable the EC sleep
|
||||||
|
failure detection mechanism. Values in between 0 and 65535
|
||||||
|
indicate the number of milliseconds the EC should wait after a
|
||||||
|
sleep transition before declaring a timeout. This includes both
|
||||||
|
the duration after a sleep command was received but before the
|
||||||
|
hardware line changed, as well as the duration between when the
|
||||||
|
hardware line changed and the kernel sent an EC resume command.
|
||||||
|
|
||||||
|
Output will be in the format: "%u\n".
|
||||||
|
|||||||
13
Documentation/ABI/testing/sysfs-amd-pmc
Normal file
13
Documentation/ABI/testing/sysfs-amd-pmc
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
What: /sys/bus/platform/drivers/amd_pmc/*/smu_fw_version
|
||||||
|
Date: October 2022
|
||||||
|
Contact: Mario Limonciello <mario.limonciello@amd.com>
|
||||||
|
Description: Reading this file reports the version of the firmware loaded to
|
||||||
|
System Management Unit (SMU) contained in AMD CPUs and
|
||||||
|
APUs.
|
||||||
|
|
||||||
|
What: /sys/bus/platform/drivers/amd_pmc/*/smu_program
|
||||||
|
Date: October 2022
|
||||||
|
Contact: Mario Limonciello <mario.limonciello@amd.com>
|
||||||
|
Description: Reading this file reports the program corresponding to the SMU
|
||||||
|
firmware version. The program field is used to disambiguate two
|
||||||
|
APU/CPU models that can share the same firmware binary.
|
||||||
13
Documentation/ABI/testing/sysfs-amd-pmf
Normal file
13
Documentation/ABI/testing/sysfs-amd-pmf
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
What: /sys/devices/platform/*/cnqf_enable
|
||||||
|
Date: September 2022
|
||||||
|
Contact: Shyam Sundar S K <Shyam-sundar.S-k@amd.com>
|
||||||
|
Description: Reading this file tells if the AMD Platform Management(PMF)
|
||||||
|
Cool n Quiet Framework(CnQF) feature is enabled or not.
|
||||||
|
|
||||||
|
This feature is not enabled by default and gets only turned on
|
||||||
|
if OEM BIOS passes a "flag" to PMF ACPI function (index 11 or 12)
|
||||||
|
or in case the user writes "on".
|
||||||
|
|
||||||
|
To turn off CnQF user can write "off" to the sysfs node.
|
||||||
|
Note: Systems that support auto mode will not have this sysfs file
|
||||||
|
available.
|
||||||
@@ -3,7 +3,7 @@ Date: May 2011
|
|||||||
KernelVersion: 3.0
|
KernelVersion: 3.0
|
||||||
Contact: Rafał Miłecki <zajec5@gmail.com>
|
Contact: Rafał Miłecki <zajec5@gmail.com>
|
||||||
Description:
|
Description:
|
||||||
Each BCMA core has it's manufacturer id. See
|
Each BCMA core has its manufacturer id. See
|
||||||
include/linux/bcma/bcma.h for possible values.
|
include/linux/bcma/bcma.h for possible values.
|
||||||
|
|
||||||
What: /sys/bus/bcma/devices/.../id
|
What: /sys/bus/bcma/devices/.../id
|
||||||
|
|||||||
@@ -516,3 +516,11 @@ Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
|||||||
Description: (Read) Returns the number of special conditional P1 right-hand keys
|
Description: (Read) Returns the number of special conditional P1 right-hand keys
|
||||||
that the trace unit can use (0x194). The value is taken
|
that the trace unit can use (0x194). The value is taken
|
||||||
directly from the HW.
|
directly from the HW.
|
||||||
|
|
||||||
|
What: /sys/bus/coresight/devices/etm<N>/ts_source
|
||||||
|
Date: October 2022
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: Mathieu Poirier <mathieu.poirier@linaro.org> or Suzuki K Poulose <suzuki.poulose@arm.com>
|
||||||
|
Description: (Read) When FEAT_TRF is implemented, value of TRFCR_ELx.TS used for
|
||||||
|
trace session. Otherwise -1 indicates an unknown time source. Check
|
||||||
|
trcidr0.tssize to see if a global timestamp is available.
|
||||||
|
|||||||
@@ -4,6 +4,12 @@ Contact: linux-iio@vger.kernel.org
|
|||||||
Description:
|
Description:
|
||||||
Count data of Count Y represented as a string.
|
Count data of Count Y represented as a string.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/capture
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Historical capture of the Count Y count data.
|
||||||
|
|
||||||
What: /sys/bus/counter/devices/counterX/countY/ceiling
|
What: /sys/bus/counter/devices/counterX/countY/ceiling
|
||||||
KernelVersion: 5.2
|
KernelVersion: 5.2
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
@@ -203,6 +209,13 @@ Description:
|
|||||||
both edges:
|
both edges:
|
||||||
Any state transition.
|
Any state transition.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/num_overflows
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
This attribute indicates the number of overflows of count Y.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/capture_component_id
|
||||||
What: /sys/bus/counter/devices/counterX/countY/ceiling_component_id
|
What: /sys/bus/counter/devices/counterX/countY/ceiling_component_id
|
||||||
What: /sys/bus/counter/devices/counterX/countY/floor_component_id
|
What: /sys/bus/counter/devices/counterX/countY/floor_component_id
|
||||||
What: /sys/bus/counter/devices/counterX/countY/count_mode_component_id
|
What: /sys/bus/counter/devices/counterX/countY/count_mode_component_id
|
||||||
@@ -213,11 +226,14 @@ What: /sys/bus/counter/devices/counterX/countY/prescaler_component_id
|
|||||||
What: /sys/bus/counter/devices/counterX/countY/preset_component_id
|
What: /sys/bus/counter/devices/counterX/countY/preset_component_id
|
||||||
What: /sys/bus/counter/devices/counterX/countY/preset_enable_component_id
|
What: /sys/bus/counter/devices/counterX/countY/preset_enable_component_id
|
||||||
What: /sys/bus/counter/devices/counterX/countY/signalZ_action_component_id
|
What: /sys/bus/counter/devices/counterX/countY/signalZ_action_component_id
|
||||||
|
What: /sys/bus/counter/devices/counterX/countY/num_overflows_component_id
|
||||||
What: /sys/bus/counter/devices/counterX/signalY/cable_fault_component_id
|
What: /sys/bus/counter/devices/counterX/signalY/cable_fault_component_id
|
||||||
What: /sys/bus/counter/devices/counterX/signalY/cable_fault_enable_component_id
|
What: /sys/bus/counter/devices/counterX/signalY/cable_fault_enable_component_id
|
||||||
What: /sys/bus/counter/devices/counterX/signalY/filter_clock_prescaler_component_id
|
What: /sys/bus/counter/devices/counterX/signalY/filter_clock_prescaler_component_id
|
||||||
What: /sys/bus/counter/devices/counterX/signalY/index_polarity_component_id
|
What: /sys/bus/counter/devices/counterX/signalY/index_polarity_component_id
|
||||||
|
What: /sys/bus/counter/devices/counterX/signalY/polarity_component_id
|
||||||
What: /sys/bus/counter/devices/counterX/signalY/synchronous_mode_component_id
|
What: /sys/bus/counter/devices/counterX/signalY/synchronous_mode_component_id
|
||||||
|
What: /sys/bus/counter/devices/counterX/signalY/frequency_component_id
|
||||||
KernelVersion: 5.16
|
KernelVersion: 5.16
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
@@ -303,6 +319,19 @@ Description:
|
|||||||
Discrete set of available values for the respective Signal Y
|
Discrete set of available values for the respective Signal Y
|
||||||
configuration are listed in this file.
|
configuration are listed in this file.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/signalY/polarity
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Active level of Signal Y. The following polarity values are
|
||||||
|
available:
|
||||||
|
|
||||||
|
positive:
|
||||||
|
Signal high state considered active level (rising edge).
|
||||||
|
|
||||||
|
negative:
|
||||||
|
Signal low state considered active level (falling edge).
|
||||||
|
|
||||||
What: /sys/bus/counter/devices/counterX/signalY/name
|
What: /sys/bus/counter/devices/counterX/signalY/name
|
||||||
KernelVersion: 5.2
|
KernelVersion: 5.2
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
@@ -345,3 +374,9 @@ Description:
|
|||||||
via index_polarity. The index function (as enabled via
|
via index_polarity. The index function (as enabled via
|
||||||
preset_enable) is performed synchronously with the
|
preset_enable) is performed synchronously with the
|
||||||
quadrature clock on the active level of the index input.
|
quadrature clock on the active level of the index input.
|
||||||
|
|
||||||
|
What: /sys/bus/counter/devices/counterX/signalY/frequency
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Read-only attribute that indicates the signal Y frequency, in Hz.
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ Description: 'FCoE Controller' instances on the fcoe bus.
|
|||||||
1) Write interface name to ctlr_create 2) Configure the FCoE
|
1) Write interface name to ctlr_create 2) Configure the FCoE
|
||||||
Controller (ctlr_X) 3) Enable the FCoE Controller to begin
|
Controller (ctlr_X) 3) Enable the FCoE Controller to begin
|
||||||
discovery and login. The FCoE Controller is destroyed by
|
discovery and login. The FCoE Controller is destroyed by
|
||||||
writing it's name, i.e. ctlr_X to the ctlr_delete file.
|
writing its name, i.e. ctlr_X to the ctlr_delete file.
|
||||||
|
|
||||||
Attributes:
|
Attributes:
|
||||||
|
|
||||||
|
|||||||
@@ -196,7 +196,7 @@ Description:
|
|||||||
Raw capacitance measurement from channel Y. Units after
|
Raw capacitance measurement from channel Y. Units after
|
||||||
application of scale and offset are nanofarads.
|
application of scale and offset are nanofarads.
|
||||||
|
|
||||||
What: /sys/.../iio:deviceX/in_capacitanceY-in_capacitanceZ_raw
|
What: /sys/.../iio:deviceX/in_capacitanceY-capacitanceZ_raw
|
||||||
KernelVersion: 3.2
|
KernelVersion: 3.2
|
||||||
Contact: linux-iio@vger.kernel.org
|
Contact: linux-iio@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
@@ -207,6 +207,25 @@ Description:
|
|||||||
is required is a consistent labeling. Units after application
|
is required is a consistent labeling. Units after application
|
||||||
of scale and offset are nanofarads.
|
of scale and offset are nanofarads.
|
||||||
|
|
||||||
|
What: /sys/.../iio:deviceX/in_capacitanceY-capacitanceZ_zeropoint
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
For differential channels, this an offset that is applied
|
||||||
|
equally to both inputs. As the reading is of the difference
|
||||||
|
between the two inputs, this should not be applied to the _raw
|
||||||
|
reading by userspace (unlike _offset) and unlike calibbias
|
||||||
|
it does not affect the differential value measured because
|
||||||
|
the effect of _zeropoint cancels out across the two inputs
|
||||||
|
that make up the differential pair. It's purpose is to bring
|
||||||
|
the individual signals, before the differential is measured,
|
||||||
|
within the measurement range of the device. The naming is
|
||||||
|
chosen because if the separate inputs that make the
|
||||||
|
differential pair are drawn on a graph in their
|
||||||
|
_raw units, this is the value that the zero point on the
|
||||||
|
measurement axis represents. It is expressed with the
|
||||||
|
same scaling as _raw.
|
||||||
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_temp_raw
|
What: /sys/bus/iio/devices/iio:deviceX/in_temp_raw
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_tempX_raw
|
What: /sys/bus/iio/devices/iio:deviceX/in_tempX_raw
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_temp_x_raw
|
What: /sys/bus/iio/devices/iio:deviceX/in_temp_x_raw
|
||||||
@@ -241,6 +260,15 @@ Description:
|
|||||||
Has all of the equivalent parameters as per voltageY. Units
|
Has all of the equivalent parameters as per voltageY. Units
|
||||||
after application of scale and offset are m/s^2.
|
after application of scale and offset are m/s^2.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_accel_linear_x_raw
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_accel_linear_y_raw
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_accel_linear_z_raw
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
As per in_accel_X_raw attributes, but minus the
|
||||||
|
acceleration due to gravity.
|
||||||
|
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_gravity_x_raw
|
What: /sys/bus/iio/devices/iio:deviceX/in_gravity_x_raw
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_gravity_y_raw
|
What: /sys/bus/iio/devices/iio:deviceX/in_gravity_y_raw
|
||||||
What: /sys/bus/iio/devices/iio:deviceX/in_gravity_z_raw
|
What: /sys/bus/iio/devices/iio:deviceX/in_gravity_z_raw
|
||||||
@@ -2038,3 +2066,99 @@ Description:
|
|||||||
Available range for the forced calibration value, expressed as:
|
Available range for the forced calibration value, expressed as:
|
||||||
|
|
||||||
- a range specified as "[min step max]"
|
- a range specified as "[min step max]"
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_voltageX_sampling_frequency
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_powerY_sampling_frequency
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_currentZ_sampling_frequency
|
||||||
|
KernelVersion: 5.20
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Some devices have separate controls of sampling frequency for
|
||||||
|
individual channels. If multiple channels are enabled in a scan,
|
||||||
|
then the sampling_frequency of the scan may be computed from the
|
||||||
|
per channel sampling frequencies.
|
||||||
|
|
||||||
|
What: /sys/.../events/in_accel_gesture_singletap_en
|
||||||
|
What: /sys/.../events/in_accel_gesture_doubletap_en
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Device generates an event on a single or double tap.
|
||||||
|
|
||||||
|
What: /sys/.../events/in_accel_gesture_singletap_value
|
||||||
|
What: /sys/.../events/in_accel_gesture_doubletap_value
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Specifies the threshold value that the device is comparing
|
||||||
|
against to generate the tap gesture event. The lower
|
||||||
|
threshold value increases the sensitivity of tap detection.
|
||||||
|
Units and the exact meaning of value are device-specific.
|
||||||
|
|
||||||
|
What: /sys/.../events/in_accel_gesture_tap_value_available
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Lists all available threshold values which can be used to
|
||||||
|
modify the sensitivity of the tap detection.
|
||||||
|
|
||||||
|
What: /sys/.../events/in_accel_gesture_singletap_reset_timeout
|
||||||
|
What: /sys/.../events/in_accel_gesture_doubletap_reset_timeout
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Specifies the timeout value in seconds for the tap detector
|
||||||
|
to not to look for another tap event after the event as
|
||||||
|
occurred. Basically the minimum quiet time between the two
|
||||||
|
single-tap's or two double-tap's.
|
||||||
|
|
||||||
|
What: /sys/.../events/in_accel_gesture_tap_reset_timeout_available
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Lists all available tap reset timeout values. Units in seconds.
|
||||||
|
|
||||||
|
What: /sys/.../events/in_accel_gesture_doubletap_tap2_min_delay
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Specifies the minimum quiet time in seconds between the two
|
||||||
|
taps of a double tap.
|
||||||
|
|
||||||
|
What: /sys/.../events/in_accel_gesture_doubletap_tap2_min_delay_available
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Lists all available delay values between two taps in the double
|
||||||
|
tap. Units in seconds.
|
||||||
|
|
||||||
|
What: /sys/.../events/in_accel_gesture_tap_maxtomin_time
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Specifies the maximum time difference allowed between upper
|
||||||
|
and lower peak of tap to consider it as the valid tap event.
|
||||||
|
Units in seconds.
|
||||||
|
|
||||||
|
What: /sys/.../events/in_accel_gesture_tap_maxtomin_time_available
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Lists all available time values between upper peak to lower
|
||||||
|
peak. Units in seconds.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_rot_yaw_raw
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_rot_pitch_raw
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_rot_roll_raw
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Raw (unscaled) euler angles readings. Units after
|
||||||
|
application of scale are deg.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/serialnumber
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
An example format is 16-bytes, 2-digits-per-byte, HEX-string
|
||||||
|
representing the sensor unique ID number.
|
||||||
|
|||||||
81
Documentation/ABI/testing/sysfs-bus-iio-bno055
Normal file
81
Documentation/ABI/testing/sysfs-bus-iio-bno055
Normal file
@@ -0,0 +1,81 @@
|
|||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_accel_raw_range
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Raw (unscaled) range for acceleration readings. Unit after
|
||||||
|
application of scale is m/s^2. Note that this doesn't affects
|
||||||
|
the scale (which should be used when changing the maximum and
|
||||||
|
minimum readable value affects also the reading scaling factor).
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_anglvel_raw_range
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Range for angular velocity readings in radians per second. Note
|
||||||
|
that this does not affects the scale (which should be used when
|
||||||
|
changing the maximum and minimum readable value affects also the
|
||||||
|
reading scaling factor).
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_accel_raw_range_available
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
List of allowed values for in_accel_raw_range attribute
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_anglvel_raw_range_available
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
List of allowed values for in_anglvel_raw_range attribute
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_magn_calibration_fast_enable
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Can be 1 or 0. Enables/disables the "Fast Magnetometer
|
||||||
|
Calibration" HW function.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/fusion_enable
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Can be 1 or 0. Enables/disables the "sensor fusion" (a.k.a.
|
||||||
|
NDOF) HW function.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/calibration_data
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Reports the binary calibration data blob for the IMU sensors.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_accel_calibration_auto_status
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Reports the autocalibration status for the accelerometer sensor.
|
||||||
|
Can be 0 (calibration non even enabled) or 1 to 5 where the greater
|
||||||
|
the number, the better the calibration status.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_gyro_calibration_auto_status
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Reports the autocalibration status for the gyroscope sensor.
|
||||||
|
Can be 0 (calibration non even enabled) or 1 to 5 where the greater
|
||||||
|
the number, the better the calibration status.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/in_magn_calibration_auto_status
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Reports the autocalibration status for the magnetometer sensor.
|
||||||
|
Can be 0 (calibration non even enabled) or 1 to 5 where the greater
|
||||||
|
the number, the better the calibration status.
|
||||||
|
|
||||||
|
What: /sys/bus/iio/devices/iio:deviceX/sys_calibration_auto_status
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Reports the status for the IMU overall autocalibration.
|
||||||
|
Can be 0 (calibration non even enabled) or 1 to 5 where the greater
|
||||||
|
the number, the better the calibration status.
|
||||||
11
Documentation/ABI/testing/sysfs-bus-iio-cdc-ad7746
Normal file
11
Documentation/ABI/testing/sysfs-bus-iio-cdc-ad7746
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
What: /sys/.../iio:deviceX/in_capacitableY_calibbias_calibration
|
||||||
|
What: /sys/.../iio:deviceX/in_capacitableY_calibscale_calibration
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: linux-iio@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
Write 1 to trigger a calibration of the calibbias or
|
||||||
|
calibscale. For calibscale, a full scale capacitance should
|
||||||
|
be connected to the capacitance input and a
|
||||||
|
calibscale_calibration then started. For calibbias see
|
||||||
|
the device datasheet section on "capacitive system offset
|
||||||
|
calibration".
|
||||||
@@ -18,7 +18,7 @@ Description:
|
|||||||
on the signal from which time of flight measurements are
|
on the signal from which time of flight measurements are
|
||||||
taken.
|
taken.
|
||||||
The appropriate values to take is dependent on both the
|
The appropriate values to take is dependent on both the
|
||||||
sensor and it's operating environment:
|
sensor and its operating environment:
|
||||||
* as3935 (0-31 range)
|
* as3935 (0-31 range)
|
||||||
18 = indoors (default)
|
18 = indoors (default)
|
||||||
14 = outdoors
|
14 = outdoors
|
||||||
|
|||||||
@@ -457,3 +457,36 @@ Description:
|
|||||||
|
|
||||||
The file is writable if the PF is bound to a driver that
|
The file is writable if the PF is bound to a driver that
|
||||||
implements ->sriov_set_msix_vec_count().
|
implements ->sriov_set_msix_vec_count().
|
||||||
|
|
||||||
|
What: /sys/bus/pci/devices/.../resourceN_resize
|
||||||
|
Date: September 2022
|
||||||
|
Contact: Alex Williamson <alex.williamson@redhat.com>
|
||||||
|
Description:
|
||||||
|
These files provide an interface to PCIe Resizable BAR support.
|
||||||
|
A file is created for each BAR resource (N) supported by the
|
||||||
|
PCIe Resizable BAR extended capability of the device. Reading
|
||||||
|
each file exposes the bitmap of available resource sizes:
|
||||||
|
|
||||||
|
# cat resource1_resize
|
||||||
|
00000000000001c0
|
||||||
|
|
||||||
|
The bitmap represents supported resource sizes for the BAR,
|
||||||
|
where bit0 = 1MB, bit1 = 2MB, bit2 = 4MB, etc. In the above
|
||||||
|
example the device supports 64MB, 128MB, and 256MB BAR sizes.
|
||||||
|
|
||||||
|
When writing the file, the user provides the bit position of
|
||||||
|
the desired resource size, for example:
|
||||||
|
|
||||||
|
# echo 7 > resource1_resize
|
||||||
|
|
||||||
|
This indicates to set the size value corresponding to bit 7,
|
||||||
|
128MB. The resulting size is 2 ^ (bit# + 20). This definition
|
||||||
|
matches the PCIe specification of this capability.
|
||||||
|
|
||||||
|
In order to make use of resource resizing, all PCI drivers must
|
||||||
|
be unbound from the device and peer devices under the same
|
||||||
|
parent bridge may need to be soft removed. In the case of
|
||||||
|
VGA devices, writing a resize value will remove low level
|
||||||
|
console drivers from the device. Raw users of pci-sysfs
|
||||||
|
resourceN attributes must be terminated prior to resizing.
|
||||||
|
Success of the resizing operation is not guaranteed.
|
||||||
|
|||||||
@@ -153,7 +153,7 @@ Date: Jan 2020
|
|||||||
KernelVersion: 5.5
|
KernelVersion: 5.5
|
||||||
Contact: Mika Westerberg <mika.westerberg@linux.intel.com>
|
Contact: Mika Westerberg <mika.westerberg@linux.intel.com>
|
||||||
Description: This attribute reports number of RX lanes the device is
|
Description: This attribute reports number of RX lanes the device is
|
||||||
using simultaneusly through its upstream port.
|
using simultaneously through its upstream port.
|
||||||
|
|
||||||
What: /sys/bus/thunderbolt/devices/.../tx_speed
|
What: /sys/bus/thunderbolt/devices/.../tx_speed
|
||||||
Date: Jan 2020
|
Date: Jan 2020
|
||||||
@@ -167,7 +167,7 @@ Date: Jan 2020
|
|||||||
KernelVersion: 5.5
|
KernelVersion: 5.5
|
||||||
Contact: Mika Westerberg <mika.westerberg@linux.intel.com>
|
Contact: Mika Westerberg <mika.westerberg@linux.intel.com>
|
||||||
Description: This attribute reports number of TX lanes the device is
|
Description: This attribute reports number of TX lanes the device is
|
||||||
using simultaneusly through its upstream port.
|
using simultaneously through its upstream port.
|
||||||
|
|
||||||
What: /sys/bus/thunderbolt/devices/.../vendor
|
What: /sys/bus/thunderbolt/devices/.../vendor
|
||||||
Date: Sep 2017
|
Date: Sep 2017
|
||||||
|
|||||||
@@ -364,7 +364,10 @@ Date: April 2019
|
|||||||
Contact: linux-pm@vger.kernel.org
|
Contact: linux-pm@vger.kernel.org
|
||||||
Description:
|
Description:
|
||||||
Represents a battery percentage level, above which charging will
|
Represents a battery percentage level, above which charging will
|
||||||
stop.
|
stop. Not all hardware is capable of setting this to an arbitrary
|
||||||
|
percentage. Drivers will round written values to the nearest
|
||||||
|
supported value. Reading back the value will show the actual
|
||||||
|
threshold set by the driver.
|
||||||
|
|
||||||
Access: Read, Write
|
Access: Read, Write
|
||||||
|
|
||||||
|
|||||||
61
Documentation/ABI/testing/sysfs-devices-hisi_ptt
Normal file
61
Documentation/ABI/testing/sysfs-devices-hisi_ptt
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
What: /sys/devices/hisi_ptt<sicl_id>_<core_id>/tune
|
||||||
|
Date: October 2022
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: Yicong Yang <yangyicong@hisilicon.com>
|
||||||
|
Description: This directory contains files for tuning the PCIe link
|
||||||
|
parameters(events). Each file is named after the event
|
||||||
|
of the PCIe link.
|
||||||
|
|
||||||
|
See Documentation/trace/hisi-ptt.rst for more information.
|
||||||
|
|
||||||
|
What: /sys/devices/hisi_ptt<sicl_id>_<core_id>/tune/qos_tx_cpl
|
||||||
|
Date: October 2022
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: Yicong Yang <yangyicong@hisilicon.com>
|
||||||
|
Description: (RW) Controls the weight of Tx completion TLPs, which influence
|
||||||
|
the proportion of outbound completion TLPs on the PCIe link.
|
||||||
|
The available tune data is [0, 1, 2]. Writing a negative value
|
||||||
|
will return an error, and out of range values will be converted
|
||||||
|
to 2. The value indicates a probable level of the event.
|
||||||
|
|
||||||
|
What: /sys/devices/hisi_ptt<sicl_id>_<core_id>/tune/qos_tx_np
|
||||||
|
Date: October 2022
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: Yicong Yang <yangyicong@hisilicon.com>
|
||||||
|
Description: (RW) Controls the weight of Tx non-posted TLPs, which influence
|
||||||
|
the proportion of outbound non-posted TLPs on the PCIe link.
|
||||||
|
The available tune data is [0, 1, 2]. Writing a negative value
|
||||||
|
will return an error, and out of range values will be converted
|
||||||
|
to 2. The value indicates a probable level of the event.
|
||||||
|
|
||||||
|
What: /sys/devices/hisi_ptt<sicl_id>_<core_id>/tune/qos_tx_p
|
||||||
|
Date: October 2022
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: Yicong Yang <yangyicong@hisilicon.com>
|
||||||
|
Description: (RW) Controls the weight of Tx posted TLPs, which influence the
|
||||||
|
proportion of outbound posted TLPs on the PCIe link.
|
||||||
|
The available tune data is [0, 1, 2]. Writing a negative value
|
||||||
|
will return an error, and out of range values will be converted
|
||||||
|
to 2. The value indicates a probable level of the event.
|
||||||
|
|
||||||
|
What: /sys/devices/hisi_ptt<sicl_id>_<core_id>/tune/rx_alloc_buf_level
|
||||||
|
Date: October 2022
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: Yicong Yang <yangyicong@hisilicon.com>
|
||||||
|
Description: (RW) Control the allocated buffer watermark for inbound packets.
|
||||||
|
The packets will be stored in the buffer first and then transmitted
|
||||||
|
either when the watermark reached or when timed out.
|
||||||
|
The available tune data is [0, 1, 2]. Writing a negative value
|
||||||
|
will return an error, and out of range values will be converted
|
||||||
|
to 2. The value indicates a probable level of the event.
|
||||||
|
|
||||||
|
What: /sys/devices/hisi_ptt<sicl_id>_<core_id>/tune/tx_alloc_buf_level
|
||||||
|
Date: October 2022
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: Yicong Yang <yangyicong@hisilicon.com>
|
||||||
|
Description: (RW) Control the allocated buffer watermark of outbound packets.
|
||||||
|
The packets will be stored in the buffer first and then transmitted
|
||||||
|
either when the watermark reached or when timed out.
|
||||||
|
The available tune data is [0, 1, 2]. Writing a negative value
|
||||||
|
will return an error, and out of range values will be converted
|
||||||
|
to 2. The value indicates a probable level of the event.
|
||||||
@@ -296,7 +296,7 @@ Description: Processor frequency boosting control
|
|||||||
|
|
||||||
This switch controls the boost setting for the whole system.
|
This switch controls the boost setting for the whole system.
|
||||||
Boosting allows the CPU and the firmware to run at a frequency
|
Boosting allows the CPU and the firmware to run at a frequency
|
||||||
beyond it's nominal limit.
|
beyond its nominal limit.
|
||||||
|
|
||||||
More details can be found in
|
More details can be found in
|
||||||
Documentation/admin-guide/pm/cpufreq.rst
|
Documentation/admin-guide/pm/cpufreq.rst
|
||||||
@@ -523,6 +523,7 @@ What: /sys/devices/system/cpu/vulnerabilities
|
|||||||
/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
|
/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
|
||||||
/sys/devices/system/cpu/vulnerabilities/itlb_multihit
|
/sys/devices/system/cpu/vulnerabilities/itlb_multihit
|
||||||
/sys/devices/system/cpu/vulnerabilities/mmio_stale_data
|
/sys/devices/system/cpu/vulnerabilities/mmio_stale_data
|
||||||
|
/sys/devices/system/cpu/vulnerabilities/retbleed
|
||||||
Date: January 2018
|
Date: January 2018
|
||||||
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
|
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
|
||||||
Description: Information about CPU vulnerabilities
|
Description: Information about CPU vulnerabilities
|
||||||
|
|||||||
8
Documentation/ABI/testing/sysfs-devices-vfio-dev
Normal file
8
Documentation/ABI/testing/sysfs-devices-vfio-dev
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
What: /sys/.../<device>/vfio-dev/vfioX/
|
||||||
|
Date: September 2022
|
||||||
|
Contact: Yi Liu <yi.l.liu@intel.com>
|
||||||
|
Description:
|
||||||
|
This directory is created when the device is bound to a
|
||||||
|
vfio driver. The layout under this directory matches what
|
||||||
|
exists for a standard 'struct device'. 'X' is a unique
|
||||||
|
index marking this device in vfio.
|
||||||
@@ -16,7 +16,7 @@ Description: Version of the application running on the device's CPU
|
|||||||
|
|
||||||
What: /sys/class/habanalabs/hl<n>/clk_max_freq_mhz
|
What: /sys/class/habanalabs/hl<n>/clk_max_freq_mhz
|
||||||
Date: Jun 2019
|
Date: Jun 2019
|
||||||
KernelVersion: not yet upstreamed
|
KernelVersion: 5.7
|
||||||
Contact: ogabbay@kernel.org
|
Contact: ogabbay@kernel.org
|
||||||
Description: Allows the user to set the maximum clock frequency, in MHz.
|
Description: Allows the user to set the maximum clock frequency, in MHz.
|
||||||
The device clock might be set to lower value than the maximum.
|
The device clock might be set to lower value than the maximum.
|
||||||
@@ -26,7 +26,7 @@ Description: Allows the user to set the maximum clock frequency, in MHz.
|
|||||||
|
|
||||||
What: /sys/class/habanalabs/hl<n>/clk_cur_freq_mhz
|
What: /sys/class/habanalabs/hl<n>/clk_cur_freq_mhz
|
||||||
Date: Jun 2019
|
Date: Jun 2019
|
||||||
KernelVersion: not yet upstreamed
|
KernelVersion: 5.7
|
||||||
Contact: ogabbay@kernel.org
|
Contact: ogabbay@kernel.org
|
||||||
Description: Displays the current frequency, in MHz, of the device clock.
|
Description: Displays the current frequency, in MHz, of the device clock.
|
||||||
This property is valid only for the Gaudi ASIC family
|
This property is valid only for the Gaudi ASIC family
|
||||||
@@ -176,6 +176,12 @@ KernelVersion: 5.1
|
|||||||
Contact: ogabbay@kernel.org
|
Contact: ogabbay@kernel.org
|
||||||
Description: Version of the device's preboot F/W code
|
Description: Version of the device's preboot F/W code
|
||||||
|
|
||||||
|
What: /sys/class/habanalabs/hl<n>/security_enabled
|
||||||
|
Date: Oct 2022
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: obitton@habana.ai
|
||||||
|
Description: Displays the device's security status
|
||||||
|
|
||||||
What: /sys/class/habanalabs/hl<n>/soft_reset
|
What: /sys/class/habanalabs/hl<n>/soft_reset
|
||||||
Date: Jan 2019
|
Date: Jan 2019
|
||||||
KernelVersion: 5.1
|
KernelVersion: 5.1
|
||||||
@@ -230,6 +236,6 @@ Description: Version of the u-boot running on the device's CPU
|
|||||||
|
|
||||||
What: /sys/class/habanalabs/hl<n>/vrm_ver
|
What: /sys/class/habanalabs/hl<n>/vrm_ver
|
||||||
Date: Jan 2022
|
Date: Jan 2022
|
||||||
KernelVersion: not yet upstreamed
|
KernelVersion: 5.17
|
||||||
Contact: ogabbay@kernel.org
|
Contact: ogabbay@kernel.org
|
||||||
Description: Version of the Device's Voltage Regulator Monitor F/W code. N/A to GOYA and GAUDI
|
Description: Version of the Device's Voltage Regulator Monitor F/W code. N/A to GOYA and GAUDI
|
||||||
|
|||||||
@@ -1417,6 +1417,15 @@ Description: This node is used to set or display whether UFS WriteBooster is
|
|||||||
platform that doesn't support UFSHCD_CAP_CLK_SCALING, we can
|
platform that doesn't support UFSHCD_CAP_CLK_SCALING, we can
|
||||||
disable/enable WriteBooster through this sysfs node.
|
disable/enable WriteBooster through this sysfs node.
|
||||||
|
|
||||||
|
What: /sys/bus/platform/drivers/ufshcd/*/enable_wb_buf_flush
|
||||||
|
What: /sys/bus/platform/devices/*.ufs/enable_wb_buf_flush
|
||||||
|
Date: July 2022
|
||||||
|
Contact: Jinyoung Choi <j-young.choi@samsung.com>
|
||||||
|
Description: This entry shows the status of WriteBooster buffer flushing
|
||||||
|
and it can be used to enable or disable the flushing.
|
||||||
|
If flushing is enabled, the device executes the flush
|
||||||
|
operation when the command queue is empty.
|
||||||
|
|
||||||
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/hpb_version
|
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/hpb_version
|
||||||
What: /sys/bus/platform/devices/*.ufs/device_descriptor/hpb_version
|
What: /sys/bus/platform/devices/*.ufs/device_descriptor/hpb_version
|
||||||
Date: June 2021
|
Date: June 2021
|
||||||
@@ -1591,6 +1600,43 @@ Description: This entry shows the status of HPB.
|
|||||||
|
|
||||||
The file is read only.
|
The file is read only.
|
||||||
|
|
||||||
|
Contact: Daniil Lunev <dlunev@chromium.org>
|
||||||
|
What: /sys/bus/platform/drivers/ufshcd/*/capabilities/
|
||||||
|
What: /sys/bus/platform/devices/*.ufs/capabilities/
|
||||||
|
Date: August 2022
|
||||||
|
Description: The group represents the effective capabilities of the
|
||||||
|
host-device pair. i.e. the capabilities which are enabled in the
|
||||||
|
driver for the specific host controller, supported by the host
|
||||||
|
controller and are supported and/or have compatible
|
||||||
|
configuration on the device side.
|
||||||
|
|
||||||
|
Contact: Daniil Lunev <dlunev@chromium.org>
|
||||||
|
What: /sys/bus/platform/drivers/ufshcd/*/capabilities/clock_scaling
|
||||||
|
What: /sys/bus/platform/devices/*.ufs/capabilities/clock_scaling
|
||||||
|
Date: August 2022
|
||||||
|
Contact: Daniil Lunev <dlunev@chromium.org>
|
||||||
|
Description: Indicates status of clock scaling.
|
||||||
|
|
||||||
|
== ============================
|
||||||
|
0 Clock scaling is not supported.
|
||||||
|
1 Clock scaling is supported.
|
||||||
|
== ============================
|
||||||
|
|
||||||
|
The file is read only.
|
||||||
|
|
||||||
|
What: /sys/bus/platform/drivers/ufshcd/*/capabilities/write_booster
|
||||||
|
What: /sys/bus/platform/devices/*.ufs/capabilities/write_booster
|
||||||
|
Date: August 2022
|
||||||
|
Contact: Daniil Lunev <dlunev@chromium.org>
|
||||||
|
Description: Indicates status of Write Booster.
|
||||||
|
|
||||||
|
== ============================
|
||||||
|
0 Write Booster can not be enabled.
|
||||||
|
1 Write Booster can be enabled.
|
||||||
|
== ============================
|
||||||
|
|
||||||
|
The file is read only.
|
||||||
|
|
||||||
What: /sys/class/scsi_device/*/device/hpb_param_sysfs/activation_thld
|
What: /sys/class/scsi_device/*/device/hpb_param_sysfs/activation_thld
|
||||||
Date: February 2021
|
Date: February 2021
|
||||||
Contact: Avri Altman <avri.altman@wdc.com>
|
Contact: Avri Altman <avri.altman@wdc.com>
|
||||||
|
|||||||
@@ -466,6 +466,30 @@ Description: Show status of f2fs superblock in real time.
|
|||||||
0x4000 SBI_IS_FREEZING freefs is in process
|
0x4000 SBI_IS_FREEZING freefs is in process
|
||||||
====== ===================== =================================
|
====== ===================== =================================
|
||||||
|
|
||||||
|
What: /sys/fs/f2fs/<disk>/stat/cp_status
|
||||||
|
Date: September 2022
|
||||||
|
Contact: "Chao Yu" <chao.yu@oppo.com>
|
||||||
|
Description: Show status of f2fs checkpoint in real time.
|
||||||
|
|
||||||
|
=============================== ==============================
|
||||||
|
cp flag value
|
||||||
|
CP_UMOUNT_FLAG 0x00000001
|
||||||
|
CP_ORPHAN_PRESENT_FLAG 0x00000002
|
||||||
|
CP_COMPACT_SUM_FLAG 0x00000004
|
||||||
|
CP_ERROR_FLAG 0x00000008
|
||||||
|
CP_FSCK_FLAG 0x00000010
|
||||||
|
CP_FASTBOOT_FLAG 0x00000020
|
||||||
|
CP_CRC_RECOVERY_FLAG 0x00000040
|
||||||
|
CP_NAT_BITS_FLAG 0x00000080
|
||||||
|
CP_TRIMMED_FLAG 0x00000100
|
||||||
|
CP_NOCRC_RECOVERY_FLAG 0x00000200
|
||||||
|
CP_LARGE_NAT_BITMAP_FLAG 0x00000400
|
||||||
|
CP_QUOTA_NEED_FSCK_FLAG 0x00000800
|
||||||
|
CP_DISABLED_FLAG 0x00001000
|
||||||
|
CP_DISABLED_QUICK_FLAG 0x00002000
|
||||||
|
CP_RESIZEFS_FLAG 0x00004000
|
||||||
|
=============================== ==============================
|
||||||
|
|
||||||
What: /sys/fs/f2fs/<disk>/ckpt_thread_ioprio
|
What: /sys/fs/f2fs/<disk>/ckpt_thread_ioprio
|
||||||
Date: January 2021
|
Date: January 2021
|
||||||
Contact: "Daeho Jeong" <daehojeong@google.com>
|
Contact: "Daeho Jeong" <daehojeong@google.com>
|
||||||
|
|||||||
@@ -55,6 +55,14 @@ Description:
|
|||||||
The object directory contains subdirectories for each function
|
The object directory contains subdirectories for each function
|
||||||
that is patched within the object.
|
that is patched within the object.
|
||||||
|
|
||||||
|
What: /sys/kernel/livepatch/<patch>/<object>/patched
|
||||||
|
Date: August 2022
|
||||||
|
KernelVersion: 6.1.0
|
||||||
|
Contact: live-patching@vger.kernel.org
|
||||||
|
Description:
|
||||||
|
An attribute which indicates whether the object is currently
|
||||||
|
patched.
|
||||||
|
|
||||||
What: /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
|
What: /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
|
||||||
Date: Nov 2014
|
Date: Nov 2014
|
||||||
KernelVersion: 3.19.0
|
KernelVersion: 3.19.0
|
||||||
|
|||||||
25
Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
Normal file
25
Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
What: /sys/devices/virtual/memory_tiering/
|
||||||
|
Date: August 2022
|
||||||
|
Contact: Linux memory management mailing list <linux-mm@kvack.org>
|
||||||
|
Description: A collection of all the memory tiers allocated.
|
||||||
|
|
||||||
|
Individual memory tier details are contained in subdirectories
|
||||||
|
named by the abstract distance of the memory tier.
|
||||||
|
|
||||||
|
/sys/devices/virtual/memory_tiering/memory_tierN/
|
||||||
|
|
||||||
|
|
||||||
|
What: /sys/devices/virtual/memory_tiering/memory_tierN/
|
||||||
|
/sys/devices/virtual/memory_tiering/memory_tierN/nodes
|
||||||
|
Date: August 2022
|
||||||
|
Contact: Linux memory management mailing list <linux-mm@kvack.org>
|
||||||
|
Description: Directory with details of a specific memory tier
|
||||||
|
|
||||||
|
This is the directory containing information about a particular
|
||||||
|
memory tier, memtierN, where N is derived based on abstract distance.
|
||||||
|
|
||||||
|
A smaller value of N implies a higher (faster) memory tier in the
|
||||||
|
hierarchy.
|
||||||
|
|
||||||
|
nodes: NUMA nodes that are part of this memory tier.
|
||||||
|
|
||||||
@@ -57,3 +57,44 @@ Description:
|
|||||||
* 0 - default,
|
* 0 - default,
|
||||||
* 1 - overboost,
|
* 1 - overboost,
|
||||||
* 2 - silent
|
* 2 - silent
|
||||||
|
|
||||||
|
What: /sys/devices/platform/<platform>/gpu_mux_mode
|
||||||
|
Date: Aug 2022
|
||||||
|
KernelVersion: 6.1
|
||||||
|
Contact: "Luke Jones" <luke@ljones.dev>
|
||||||
|
Description:
|
||||||
|
Switch the GPU hardware MUX mode. Laptops with this feature can
|
||||||
|
can be toggled to boot with only the dGPU (discrete mode) or in
|
||||||
|
standard Optimus/Hybrid mode. On switch a reboot is required:
|
||||||
|
|
||||||
|
* 0 - Discrete GPU,
|
||||||
|
* 1 - Optimus/Hybrid,
|
||||||
|
|
||||||
|
What: /sys/devices/platform/<platform>/dgpu_disable
|
||||||
|
Date: Aug 2022
|
||||||
|
KernelVersion: 5.17
|
||||||
|
Contact: "Luke Jones" <luke@ljones.dev>
|
||||||
|
Description:
|
||||||
|
Disable discrete GPU:
|
||||||
|
* 0 - Enable dGPU,
|
||||||
|
* 1 - Disable dGPU
|
||||||
|
|
||||||
|
What: /sys/devices/platform/<platform>/egpu_enable
|
||||||
|
Date: Aug 2022
|
||||||
|
KernelVersion: 5.17
|
||||||
|
Contact: "Luke Jones" <luke@ljones.dev>
|
||||||
|
Description:
|
||||||
|
Enable the external GPU paired with ROG X-Flow laptops.
|
||||||
|
Toggling this setting will also trigger ACPI to disable the dGPU:
|
||||||
|
|
||||||
|
* 0 - Disable,
|
||||||
|
* 1 - Enable
|
||||||
|
|
||||||
|
What: /sys/devices/platform/<platform>/panel_od
|
||||||
|
Date: Aug 2022
|
||||||
|
KernelVersion: 5.17
|
||||||
|
Contact: "Luke Jones" <luke@ljones.dev>
|
||||||
|
Description:
|
||||||
|
Enable an LCD response-time boost to reduce or remove ghosting:
|
||||||
|
* 0 - Disable,
|
||||||
|
* 1 - Enable
|
||||||
|
|||||||
15
Documentation/ABI/testing/sysfs-platform-brcmstb-memc
Normal file
15
Documentation/ABI/testing/sysfs-platform-brcmstb-memc
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
What: /sys/bus/platform/devices/*/srpd
|
||||||
|
Date: July 2022
|
||||||
|
KernelVersion: 5.21
|
||||||
|
Contact: Florian Fainelli <f.fainelli@gmail.com>
|
||||||
|
Description:
|
||||||
|
Self Refresh Power Down (SRPD) inactivity timeout counted in
|
||||||
|
internal DDR controller clock cycles. Possible values range
|
||||||
|
from 0 (disable inactivity timeout) to 65535 (0xffff).
|
||||||
|
|
||||||
|
What: /sys/bus/platform/devices/*/frequency
|
||||||
|
Date: July 2022
|
||||||
|
KernelVersion: 5.21
|
||||||
|
Contact: Florian Fainelli <f.fainelli@gmail.com>
|
||||||
|
Description:
|
||||||
|
DDR PHY frequency in Hz.
|
||||||
@@ -2,8 +2,8 @@ What: /sys/bus/platform/devices/ci_hdrc.0/role
|
|||||||
Date: Mar 2017
|
Date: Mar 2017
|
||||||
Contact: Peter Chen <peter.chen@nxp.com>
|
Contact: Peter Chen <peter.chen@nxp.com>
|
||||||
Description:
|
Description:
|
||||||
It returns string "gadget" or "host" when read it, it indicates
|
When read, it returns string "gadget" or "host", indicating
|
||||||
current controller role.
|
the current controller role.
|
||||||
|
|
||||||
It will do role switch when write "gadget" or "host" to it.
|
It will do role switch when "gadget" or "host" is written to it.
|
||||||
Only controller at dual-role configuration supports writing.
|
Only controller at dual-role configuration supports writing.
|
||||||
|
|||||||
@@ -152,7 +152,7 @@ Description:
|
|||||||
case further investigation is required to determine which
|
case further investigation is required to determine which
|
||||||
device is causing the problem. Note that genuine RTC clock
|
device is causing the problem. Note that genuine RTC clock
|
||||||
values (such as when pm_trace has not been used), can still
|
values (such as when pm_trace has not been used), can still
|
||||||
match a device and output it's name here.
|
match a device and output its name here.
|
||||||
|
|
||||||
What: /sys/power/pm_async
|
What: /sys/power/pm_async
|
||||||
Date: January 2009
|
Date: January 2009
|
||||||
|
|||||||
@@ -66,8 +66,13 @@ over a rather long period of time, but improvements are always welcome!
|
|||||||
As a rough rule of thumb, any dereference of an RCU-protected
|
As a rough rule of thumb, any dereference of an RCU-protected
|
||||||
pointer must be covered by rcu_read_lock(), rcu_read_lock_bh(),
|
pointer must be covered by rcu_read_lock(), rcu_read_lock_bh(),
|
||||||
rcu_read_lock_sched(), or by the appropriate update-side lock.
|
rcu_read_lock_sched(), or by the appropriate update-side lock.
|
||||||
Disabling of preemption can serve as rcu_read_lock_sched(), but
|
Explicit disabling of preemption (preempt_disable(), for example)
|
||||||
is less readable and prevents lockdep from detecting locking issues.
|
can serve as rcu_read_lock_sched(), but is less readable and
|
||||||
|
prevents lockdep from detecting locking issues.
|
||||||
|
|
||||||
|
Please not that you *cannot* rely on code known to be built
|
||||||
|
only in non-preemptible kernels. Such code can and will break,
|
||||||
|
especially in kernels built with CONFIG_PREEMPT_COUNT=y.
|
||||||
|
|
||||||
Letting RCU-protected pointers "leak" out of an RCU read-side
|
Letting RCU-protected pointers "leak" out of an RCU read-side
|
||||||
critical section is every bit as bad as letting them leak out
|
critical section is every bit as bad as letting them leak out
|
||||||
@@ -185,6 +190,9 @@ over a rather long period of time, but improvements are always welcome!
|
|||||||
|
|
||||||
5. If call_rcu() or call_srcu() is used, the callback function will
|
5. If call_rcu() or call_srcu() is used, the callback function will
|
||||||
be called from softirq context. In particular, it cannot block.
|
be called from softirq context. In particular, it cannot block.
|
||||||
|
If you need the callback to block, run that code in a workqueue
|
||||||
|
handler scheduled from the callback. The queue_rcu_work()
|
||||||
|
function does this for you in the case of call_rcu().
|
||||||
|
|
||||||
6. Since synchronize_rcu() can block, it cannot be called
|
6. Since synchronize_rcu() can block, it cannot be called
|
||||||
from any sort of irq context. The same rule applies
|
from any sort of irq context. The same rule applies
|
||||||
@@ -297,7 +305,8 @@ over a rather long period of time, but improvements are always welcome!
|
|||||||
the machine.
|
the machine.
|
||||||
|
|
||||||
d. Periodically invoke synchronize_rcu(), permitting a limited
|
d. Periodically invoke synchronize_rcu(), permitting a limited
|
||||||
number of updates per grace period.
|
number of updates per grace period. Better yet, periodically
|
||||||
|
invoke rcu_barrier() to wait for all outstanding callbacks.
|
||||||
|
|
||||||
The same cautions apply to call_srcu() and kfree_rcu().
|
The same cautions apply to call_srcu() and kfree_rcu().
|
||||||
|
|
||||||
@@ -477,6 +486,6 @@ over a rather long period of time, but improvements are always welcome!
|
|||||||
So if you need to wait for both an RCU grace period and for
|
So if you need to wait for both an RCU grace period and for
|
||||||
all pre-existing call_rcu() callbacks, you will need to execute
|
all pre-existing call_rcu() callbacks, you will need to execute
|
||||||
both rcu_barrier() and synchronize_rcu(), if necessary, using
|
both rcu_barrier() and synchronize_rcu(), if necessary, using
|
||||||
something like workqueues to to execute them concurrently.
|
something like workqueues to execute them concurrently.
|
||||||
|
|
||||||
See rcubarrier.rst for more information.
|
See rcubarrier.rst for more information.
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ checking of rcu_dereference() primitives:
|
|||||||
rcu_access_pointer(p):
|
rcu_access_pointer(p):
|
||||||
Return the value of the pointer and omit all barriers,
|
Return the value of the pointer and omit all barriers,
|
||||||
but retain the compiler constraints that prevent duplicating
|
but retain the compiler constraints that prevent duplicating
|
||||||
or coalescsing. This is useful when when testing the
|
or coalescsing. This is useful when testing the
|
||||||
value of the pointer itself, for example, against NULL.
|
value of the pointer itself, for example, against NULL.
|
||||||
|
|
||||||
The rcu_dereference_check() check expression can be any boolean
|
The rcu_dereference_check() check expression can be any boolean
|
||||||
|
|||||||
@@ -128,10 +128,16 @@ Follow these rules to keep your RCU code working properly:
|
|||||||
This sort of comparison occurs frequently when scanning
|
This sort of comparison occurs frequently when scanning
|
||||||
RCU-protected circular linked lists.
|
RCU-protected circular linked lists.
|
||||||
|
|
||||||
Note that if checks for being within an RCU read-side
|
Note that if the pointer comparison is done outside
|
||||||
critical section are not required and the pointer is never
|
of an RCU read-side critical section, and the pointer
|
||||||
dereferenced, rcu_access_pointer() should be used in place
|
is never dereferenced, rcu_access_pointer() should be
|
||||||
of rcu_dereference().
|
used in place of rcu_dereference(). In most cases,
|
||||||
|
it is best to avoid accidental dereferences by testing
|
||||||
|
the rcu_access_pointer() return value directly, without
|
||||||
|
assigning it to a variable.
|
||||||
|
|
||||||
|
Within an RCU read-side critical section, there is little
|
||||||
|
reason to use rcu_access_pointer().
|
||||||
|
|
||||||
- The comparison is against a pointer that references memory
|
- The comparison is against a pointer that references memory
|
||||||
that was initialized "a long time ago." The reason
|
that was initialized "a long time ago." The reason
|
||||||
|
|||||||
@@ -6,13 +6,15 @@ What is RCU? -- "Read, Copy, Update"
|
|||||||
Please note that the "What is RCU?" LWN series is an excellent place
|
Please note that the "What is RCU?" LWN series is an excellent place
|
||||||
to start learning about RCU:
|
to start learning about RCU:
|
||||||
|
|
||||||
| 1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/
|
| 1. What is RCU, Fundamentally? https://lwn.net/Articles/262464/
|
||||||
| 2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/
|
| 2. What is RCU? Part 2: Usage https://lwn.net/Articles/263130/
|
||||||
| 3. RCU part 3: the RCU API http://lwn.net/Articles/264090/
|
| 3. RCU part 3: the RCU API https://lwn.net/Articles/264090/
|
||||||
| 4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/
|
| 4. The RCU API, 2010 Edition https://lwn.net/Articles/418853/
|
||||||
| 2010 Big API Table http://lwn.net/Articles/419086/
|
| 2010 Big API Table https://lwn.net/Articles/419086/
|
||||||
| 5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/
|
| 5. The RCU API, 2014 Edition https://lwn.net/Articles/609904/
|
||||||
| 2014 Big API Table http://lwn.net/Articles/609973/
|
| 2014 Big API Table https://lwn.net/Articles/609973/
|
||||||
|
| 6. The RCU API, 2019 Edition https://lwn.net/Articles/777036/
|
||||||
|
| 2019 Big API Table https://lwn.net/Articles/777165/
|
||||||
|
|
||||||
|
|
||||||
What is RCU?
|
What is RCU?
|
||||||
@@ -915,13 +917,18 @@ which an RCU reference is held include:
|
|||||||
The understanding that RCU provides a reference that only prevents a
|
The understanding that RCU provides a reference that only prevents a
|
||||||
change of type is particularly visible with objects allocated from a
|
change of type is particularly visible with objects allocated from a
|
||||||
slab cache marked ``SLAB_TYPESAFE_BY_RCU``. RCU operations may yield a
|
slab cache marked ``SLAB_TYPESAFE_BY_RCU``. RCU operations may yield a
|
||||||
reference to an object from such a cache that has been concurrently
|
reference to an object from such a cache that has been concurrently freed
|
||||||
freed and the memory reallocated to a completely different object,
|
and the memory reallocated to a completely different object, though of
|
||||||
though of the same type. In this case RCU doesn't even protect the
|
the same type. In this case RCU doesn't even protect the identity of the
|
||||||
identity of the object from changing, only its type. So the object
|
object from changing, only its type. So the object found may not be the
|
||||||
found may not be the one expected, but it will be one where it is safe
|
one expected, but it will be one where it is safe to take a reference
|
||||||
to take a reference or spinlock and then confirm that the identity
|
(and then potentially acquiring a spinlock), allowing subsequent code
|
||||||
matches the expectations.
|
to check whether the identity matches expectations. It is tempting
|
||||||
|
to simply acquire the spinlock without first taking the reference, but
|
||||||
|
unfortunately any spinlock in a ``SLAB_TYPESAFE_BY_RCU`` object must be
|
||||||
|
initialized after each and every call to kmem_cache_alloc(), which renders
|
||||||
|
reference-free spinlock acquisition completely unsafe. Therefore, when
|
||||||
|
using ``SLAB_TYPESAFE_BY_RCU``, make proper use of a reference counter.
|
||||||
|
|
||||||
With traditional reference counting -- such as that implemented by the
|
With traditional reference counting -- such as that implemented by the
|
||||||
kref library in Linux -- there is typically code that runs when the last
|
kref library in Linux -- there is typically code that runs when the last
|
||||||
@@ -1057,14 +1064,20 @@ SRCU: Initialization/cleanup::
|
|||||||
init_srcu_struct
|
init_srcu_struct
|
||||||
cleanup_srcu_struct
|
cleanup_srcu_struct
|
||||||
|
|
||||||
All: lockdep-checked RCU-protected pointer access::
|
All: lockdep-checked RCU utility APIs::
|
||||||
|
|
||||||
rcu_access_pointer
|
|
||||||
rcu_dereference_raw
|
|
||||||
RCU_LOCKDEP_WARN
|
RCU_LOCKDEP_WARN
|
||||||
rcu_sleep_check
|
rcu_sleep_check
|
||||||
RCU_NONIDLE
|
RCU_NONIDLE
|
||||||
|
|
||||||
|
All: Unchecked RCU-protected pointer access::
|
||||||
|
|
||||||
|
rcu_dereference_raw
|
||||||
|
|
||||||
|
All: Unchecked RCU-protected pointer access with dereferencing prohibited::
|
||||||
|
|
||||||
|
rcu_access_pointer
|
||||||
|
|
||||||
See the comment headers in the source code (or the docbook generated
|
See the comment headers in the source code (or the docbook generated
|
||||||
from them) for more information.
|
from them) for more information.
|
||||||
|
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ a) waiting for a CPU (while being runnable)
|
|||||||
b) completion of synchronous block I/O initiated by the task
|
b) completion of synchronous block I/O initiated by the task
|
||||||
c) swapping in pages
|
c) swapping in pages
|
||||||
d) memory reclaim
|
d) memory reclaim
|
||||||
e) thrashing page cache
|
e) thrashing
|
||||||
f) direct compact
|
f) direct compact
|
||||||
g) write-protect copy
|
g) write-protect copy
|
||||||
|
|
||||||
|
|||||||
@@ -1,9 +1,9 @@
|
|||||||
.. _readme:
|
.. _readme:
|
||||||
|
|
||||||
Linux kernel release 5.x <http://kernel.org/>
|
Linux kernel release 6.x <http://kernel.org/>
|
||||||
=============================================
|
=============================================
|
||||||
|
|
||||||
These are the release notes for Linux version 5. Read them carefully,
|
These are the release notes for Linux version 6. Read them carefully,
|
||||||
as they tell you what this is all about, explain how to install the
|
as they tell you what this is all about, explain how to install the
|
||||||
kernel, and what to do if something goes wrong.
|
kernel, and what to do if something goes wrong.
|
||||||
|
|
||||||
@@ -63,7 +63,7 @@ Installing the kernel source
|
|||||||
directory where you have permissions (e.g. your home directory) and
|
directory where you have permissions (e.g. your home directory) and
|
||||||
unpack it::
|
unpack it::
|
||||||
|
|
||||||
xz -cd linux-5.x.tar.xz | tar xvf -
|
xz -cd linux-6.x.tar.xz | tar xvf -
|
||||||
|
|
||||||
Replace "X" with the version number of the latest kernel.
|
Replace "X" with the version number of the latest kernel.
|
||||||
|
|
||||||
@@ -72,12 +72,12 @@ Installing the kernel source
|
|||||||
files. They should match the library, and not get messed up by
|
files. They should match the library, and not get messed up by
|
||||||
whatever the kernel-du-jour happens to be.
|
whatever the kernel-du-jour happens to be.
|
||||||
|
|
||||||
- You can also upgrade between 5.x releases by patching. Patches are
|
- You can also upgrade between 6.x releases by patching. Patches are
|
||||||
distributed in the xz format. To install by patching, get all the
|
distributed in the xz format. To install by patching, get all the
|
||||||
newer patch files, enter the top level directory of the kernel source
|
newer patch files, enter the top level directory of the kernel source
|
||||||
(linux-5.x) and execute::
|
(linux-6.x) and execute::
|
||||||
|
|
||||||
xz -cd ../patch-5.x.xz | patch -p1
|
xz -cd ../patch-6.x.xz | patch -p1
|
||||||
|
|
||||||
Replace "x" for all versions bigger than the version "x" of your current
|
Replace "x" for all versions bigger than the version "x" of your current
|
||||||
source tree, **in_order**, and you should be ok. You may want to remove
|
source tree, **in_order**, and you should be ok. You may want to remove
|
||||||
@@ -85,13 +85,13 @@ Installing the kernel source
|
|||||||
that there are no failed patches (some-file-name# or some-file-name.rej).
|
that there are no failed patches (some-file-name# or some-file-name.rej).
|
||||||
If there are, either you or I have made a mistake.
|
If there are, either you or I have made a mistake.
|
||||||
|
|
||||||
Unlike patches for the 5.x kernels, patches for the 5.x.y kernels
|
Unlike patches for the 6.x kernels, patches for the 6.x.y kernels
|
||||||
(also known as the -stable kernels) are not incremental but instead apply
|
(also known as the -stable kernels) are not incremental but instead apply
|
||||||
directly to the base 5.x kernel. For example, if your base kernel is 5.0
|
directly to the base 6.x kernel. For example, if your base kernel is 6.0
|
||||||
and you want to apply the 5.0.3 patch, you must not first apply the 5.0.1
|
and you want to apply the 6.0.3 patch, you must not first apply the 6.0.1
|
||||||
and 5.0.2 patches. Similarly, if you are running kernel version 5.0.2 and
|
and 6.0.2 patches. Similarly, if you are running kernel version 6.0.2 and
|
||||||
want to jump to 5.0.3, you must first reverse the 5.0.2 patch (that is,
|
want to jump to 6.0.3, you must first reverse the 6.0.2 patch (that is,
|
||||||
patch -R) **before** applying the 5.0.3 patch. You can read more on this in
|
patch -R) **before** applying the 6.0.3 patch. You can read more on this in
|
||||||
:ref:`Documentation/process/applying-patches.rst <applying_patches>`.
|
:ref:`Documentation/process/applying-patches.rst <applying_patches>`.
|
||||||
|
|
||||||
Alternatively, the script patch-kernel can be used to automate this
|
Alternatively, the script patch-kernel can be used to automate this
|
||||||
@@ -114,7 +114,7 @@ Installing the kernel source
|
|||||||
Software requirements
|
Software requirements
|
||||||
---------------------
|
---------------------
|
||||||
|
|
||||||
Compiling and running the 5.x kernels requires up-to-date
|
Compiling and running the 6.x kernels requires up-to-date
|
||||||
versions of various software packages. Consult
|
versions of various software packages. Consult
|
||||||
:ref:`Documentation/process/changes.rst <changes>` for the minimum version numbers
|
:ref:`Documentation/process/changes.rst <changes>` for the minimum version numbers
|
||||||
required and how to get updates for these packages. Beware that using
|
required and how to get updates for these packages. Beware that using
|
||||||
@@ -132,12 +132,12 @@ Build directory for the kernel
|
|||||||
place for the output files (including .config).
|
place for the output files (including .config).
|
||||||
Example::
|
Example::
|
||||||
|
|
||||||
kernel source code: /usr/src/linux-5.x
|
kernel source code: /usr/src/linux-6.x
|
||||||
build directory: /home/name/build/kernel
|
build directory: /home/name/build/kernel
|
||||||
|
|
||||||
To configure and build the kernel, use::
|
To configure and build the kernel, use::
|
||||||
|
|
||||||
cd /usr/src/linux-5.x
|
cd /usr/src/linux-6.x
|
||||||
make O=/home/name/build/kernel menuconfig
|
make O=/home/name/build/kernel menuconfig
|
||||||
make O=/home/name/build/kernel
|
make O=/home/name/build/kernel
|
||||||
sudo make O=/home/name/build/kernel modules_install install
|
sudo make O=/home/name/build/kernel modules_install install
|
||||||
@@ -262,8 +262,6 @@ Compiling the kernel
|
|||||||
- Make sure you have at least gcc 5.1 available.
|
- Make sure you have at least gcc 5.1 available.
|
||||||
For more information, refer to :ref:`Documentation/process/changes.rst <changes>`.
|
For more information, refer to :ref:`Documentation/process/changes.rst <changes>`.
|
||||||
|
|
||||||
Please note that you can still run a.out user programs with this kernel.
|
|
||||||
|
|
||||||
- Do a ``make`` to create a compressed kernel image. It is also
|
- Do a ``make`` to create a compressed kernel image. It is also
|
||||||
possible to do ``make install`` if you have lilo installed to suit the
|
possible to do ``make install`` if you have lilo installed to suit the
|
||||||
kernel makefiles, but you may want to check your particular lilo setup first.
|
kernel makefiles, but you may want to check your particular lilo setup first.
|
||||||
@@ -332,85 +330,10 @@ Compiling the kernel
|
|||||||
If something goes wrong
|
If something goes wrong
|
||||||
-----------------------
|
-----------------------
|
||||||
|
|
||||||
- If you have problems that seem to be due to kernel bugs, please check
|
If you have problems that seem to be due to kernel bugs, please follow the
|
||||||
the file MAINTAINERS to see if there is a particular person associated
|
instructions at 'Documentation/admin-guide/reporting-issues.rst'.
|
||||||
with the part of the kernel that you are having trouble with. If there
|
|
||||||
isn't anyone listed there, then the second best thing is to mail
|
|
||||||
them to me (torvalds@linux-foundation.org), and possibly to any other
|
|
||||||
relevant mailing-list or to the newsgroup.
|
|
||||||
|
|
||||||
- In all bug-reports, *please* tell what kernel you are talking about,
|
Hints on understanding kernel bug reports are in
|
||||||
how to duplicate the problem, and what your setup is (use your common
|
'Documentation/admin-guide/bug-hunting.rst'. More on debugging the kernel
|
||||||
sense). If the problem is new, tell me so, and if the problem is
|
with gdb is in 'Documentation/dev-tools/gdb-kernel-debugging.rst' and
|
||||||
old, please try to tell me when you first noticed it.
|
'Documentation/dev-tools/kgdb.rst'.
|
||||||
|
|
||||||
- If the bug results in a message like::
|
|
||||||
|
|
||||||
unable to handle kernel paging request at address C0000010
|
|
||||||
Oops: 0002
|
|
||||||
EIP: 0010:XXXXXXXX
|
|
||||||
eax: xxxxxxxx ebx: xxxxxxxx ecx: xxxxxxxx edx: xxxxxxxx
|
|
||||||
esi: xxxxxxxx edi: xxxxxxxx ebp: xxxxxxxx
|
|
||||||
ds: xxxx es: xxxx fs: xxxx gs: xxxx
|
|
||||||
Pid: xx, process nr: xx
|
|
||||||
xx xx xx xx xx xx xx xx xx xx
|
|
||||||
|
|
||||||
or similar kernel debugging information on your screen or in your
|
|
||||||
system log, please duplicate it *exactly*. The dump may look
|
|
||||||
incomprehensible to you, but it does contain information that may
|
|
||||||
help debugging the problem. The text above the dump is also
|
|
||||||
important: it tells something about why the kernel dumped code (in
|
|
||||||
the above example, it's due to a bad kernel pointer). More information
|
|
||||||
on making sense of the dump is in Documentation/admin-guide/bug-hunting.rst
|
|
||||||
|
|
||||||
- If you compiled the kernel with CONFIG_KALLSYMS you can send the dump
|
|
||||||
as is, otherwise you will have to use the ``ksymoops`` program to make
|
|
||||||
sense of the dump (but compiling with CONFIG_KALLSYMS is usually preferred).
|
|
||||||
This utility can be downloaded from
|
|
||||||
https://www.kernel.org/pub/linux/utils/kernel/ksymoops/ .
|
|
||||||
Alternatively, you can do the dump lookup by hand:
|
|
||||||
|
|
||||||
- In debugging dumps like the above, it helps enormously if you can
|
|
||||||
look up what the EIP value means. The hex value as such doesn't help
|
|
||||||
me or anybody else very much: it will depend on your particular
|
|
||||||
kernel setup. What you should do is take the hex value from the EIP
|
|
||||||
line (ignore the ``0010:``), and look it up in the kernel namelist to
|
|
||||||
see which kernel function contains the offending address.
|
|
||||||
|
|
||||||
To find out the kernel function name, you'll need to find the system
|
|
||||||
binary associated with the kernel that exhibited the symptom. This is
|
|
||||||
the file 'linux/vmlinux'. To extract the namelist and match it against
|
|
||||||
the EIP from the kernel crash, do::
|
|
||||||
|
|
||||||
nm vmlinux | sort | less
|
|
||||||
|
|
||||||
This will give you a list of kernel addresses sorted in ascending
|
|
||||||
order, from which it is simple to find the function that contains the
|
|
||||||
offending address. Note that the address given by the kernel
|
|
||||||
debugging messages will not necessarily match exactly with the
|
|
||||||
function addresses (in fact, that is very unlikely), so you can't
|
|
||||||
just 'grep' the list: the list will, however, give you the starting
|
|
||||||
point of each kernel function, so by looking for the function that
|
|
||||||
has a starting address lower than the one you are searching for but
|
|
||||||
is followed by a function with a higher address you will find the one
|
|
||||||
you want. In fact, it may be a good idea to include a bit of
|
|
||||||
"context" in your problem report, giving a few lines around the
|
|
||||||
interesting one.
|
|
||||||
|
|
||||||
If you for some reason cannot do the above (you have a pre-compiled
|
|
||||||
kernel image or similar), telling me as much about your setup as
|
|
||||||
possible will help. Please read
|
|
||||||
'Documentation/admin-guide/reporting-issues.rst' for details.
|
|
||||||
|
|
||||||
- Alternatively, you can use gdb on a running kernel. (read-only; i.e. you
|
|
||||||
cannot change values or set break points.) To do this, first compile the
|
|
||||||
kernel with -g; edit arch/x86/Makefile appropriately, then do a ``make
|
|
||||||
clean``. You'll also need to enable CONFIG_PROC_FS (via ``make config``).
|
|
||||||
|
|
||||||
After you've rebooted with the new kernel, do ``gdb vmlinux /proc/kcore``.
|
|
||||||
You can now use all the usual gdb commands. The command to look up the
|
|
||||||
point where your system crashed is ``l *0xXXXXXXXX``. (Replace the XXXes
|
|
||||||
with the EIP value.)
|
|
||||||
|
|
||||||
gdb'ing a non-running kernel currently fails because ``gdb`` (wrongly)
|
|
||||||
disregards the starting offset for which the kernel is compiled.
|
|
||||||
|
|||||||
@@ -1,13 +0,0 @@
|
|||||||
.. SPDX-License-Identifier: GPL-2.0
|
|
||||||
|
|
||||||
===============
|
|
||||||
Overriding DSDT
|
|
||||||
===============
|
|
||||||
|
|
||||||
Linux supports a method of overriding the BIOS DSDT:
|
|
||||||
|
|
||||||
CONFIG_ACPI_CUSTOM_DSDT - builds the image into the kernel.
|
|
||||||
|
|
||||||
When to use this method is described in detail on the
|
|
||||||
Linux/ACPI home page:
|
|
||||||
https://01.org/linux-acpi/documentation/overriding-dsdt
|
|
||||||
@@ -299,7 +299,7 @@ Per-node-per-memcgroup LRU (cgroup's private LRU) is guarded by
|
|||||||
lruvec->lru_lock; PG_lru bit of page->flags is cleared before
|
lruvec->lru_lock; PG_lru bit of page->flags is cleared before
|
||||||
isolating a page from its LRU under lruvec->lru_lock.
|
isolating a page from its LRU under lruvec->lru_lock.
|
||||||
|
|
||||||
2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
|
2.7 Kernel Memory Extension
|
||||||
-----------------------------------------------
|
-----------------------------------------------
|
||||||
|
|
||||||
With the Kernel memory extension, the Memory Controller is able to limit
|
With the Kernel memory extension, the Memory Controller is able to limit
|
||||||
@@ -386,8 +386,6 @@ U != 0, K >= U:
|
|||||||
|
|
||||||
a. Enable CONFIG_CGROUPS
|
a. Enable CONFIG_CGROUPS
|
||||||
b. Enable CONFIG_MEMCG
|
b. Enable CONFIG_MEMCG
|
||||||
c. Enable CONFIG_MEMCG_SWAP (to use swap extension)
|
|
||||||
d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
|
|
||||||
|
|
||||||
3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
|
3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
|
||||||
-------------------------------------------------------------------
|
-------------------------------------------------------------------
|
||||||
|
|||||||
@@ -976,6 +976,29 @@ All cgroup core files are prefixed with "cgroup."
|
|||||||
killing cgroups is a process directed operation, i.e. it affects
|
killing cgroups is a process directed operation, i.e. it affects
|
||||||
the whole thread-group.
|
the whole thread-group.
|
||||||
|
|
||||||
|
cgroup.pressure
|
||||||
|
A read-write single value file that allowed values are "0" and "1".
|
||||||
|
The default is "1".
|
||||||
|
|
||||||
|
Writing "0" to the file will disable the cgroup PSI accounting.
|
||||||
|
Writing "1" to the file will re-enable the cgroup PSI accounting.
|
||||||
|
|
||||||
|
This control attribute is not hierarchical, so disable or enable PSI
|
||||||
|
accounting in a cgroup does not affect PSI accounting in descendants
|
||||||
|
and doesn't need pass enablement via ancestors from root.
|
||||||
|
|
||||||
|
The reason this control attribute exists is that PSI accounts stalls for
|
||||||
|
each cgroup separately and aggregates it at each level of the hierarchy.
|
||||||
|
This may cause non-negligible overhead for some workloads when under
|
||||||
|
deep level of the hierarchy, in which case this control attribute can
|
||||||
|
be used to disable PSI accounting in the non-leaf cgroups.
|
||||||
|
|
||||||
|
irq.pressure
|
||||||
|
A read-write nested-keyed file.
|
||||||
|
|
||||||
|
Shows pressure stall information for IRQ/SOFTIRQ. See
|
||||||
|
:ref:`Documentation/accounting/psi.rst <psi>` for details.
|
||||||
|
|
||||||
Controllers
|
Controllers
|
||||||
===========
|
===========
|
||||||
|
|
||||||
@@ -1355,6 +1378,11 @@ PAGE_SIZE multiple when read back.
|
|||||||
pagetables
|
pagetables
|
||||||
Amount of memory allocated for page tables.
|
Amount of memory allocated for page tables.
|
||||||
|
|
||||||
|
sec_pagetables
|
||||||
|
Amount of memory allocated for secondary page tables,
|
||||||
|
this currently includes KVM mmu allocations on x86
|
||||||
|
and arm64.
|
||||||
|
|
||||||
percpu (npn)
|
percpu (npn)
|
||||||
Amount of memory used for storing per-cpu kernel
|
Amount of memory used for storing per-cpu kernel
|
||||||
data structures.
|
data structures.
|
||||||
@@ -2185,75 +2213,93 @@ Cpuset Interface Files
|
|||||||
|
|
||||||
It accepts only the following input values when written to.
|
It accepts only the following input values when written to.
|
||||||
|
|
||||||
======== ================================
|
========== =====================================
|
||||||
"root" a partition root
|
|
||||||
"member" a non-root member of a partition
|
|
||||||
======== ================================
|
|
||||||
|
|
||||||
When set to be a partition root, the current cgroup is the
|
|
||||||
root of a new partition or scheduling domain that comprises
|
|
||||||
itself and all its descendants except those that are separate
|
|
||||||
partition roots themselves and their descendants. The root
|
|
||||||
cgroup is always a partition root.
|
|
||||||
|
|
||||||
There are constraints on where a partition root can be set.
|
|
||||||
It can only be set in a cgroup if all the following conditions
|
|
||||||
are true.
|
|
||||||
|
|
||||||
1) The "cpuset.cpus" is not empty and the list of CPUs are
|
|
||||||
exclusive, i.e. they are not shared by any of its siblings.
|
|
||||||
2) The parent cgroup is a partition root.
|
|
||||||
3) The "cpuset.cpus" is also a proper subset of the parent's
|
|
||||||
"cpuset.cpus.effective".
|
|
||||||
4) There is no child cgroups with cpuset enabled. This is for
|
|
||||||
eliminating corner cases that have to be handled if such a
|
|
||||||
condition is allowed.
|
|
||||||
|
|
||||||
Setting it to partition root will take the CPUs away from the
|
|
||||||
effective CPUs of the parent cgroup. Once it is set, this
|
|
||||||
file cannot be reverted back to "member" if there are any child
|
|
||||||
cgroups with cpuset enabled.
|
|
||||||
|
|
||||||
A parent partition cannot distribute all its CPUs to its
|
|
||||||
child partitions. There must be at least one cpu left in the
|
|
||||||
parent partition.
|
|
||||||
|
|
||||||
Once becoming a partition root, changes to "cpuset.cpus" is
|
|
||||||
generally allowed as long as the first condition above is true,
|
|
||||||
the change will not take away all the CPUs from the parent
|
|
||||||
partition and the new "cpuset.cpus" value is a superset of its
|
|
||||||
children's "cpuset.cpus" values.
|
|
||||||
|
|
||||||
Sometimes, external factors like changes to ancestors'
|
|
||||||
"cpuset.cpus" or cpu hotplug can cause the state of the partition
|
|
||||||
root to change. On read, the "cpuset.sched.partition" file
|
|
||||||
can show the following values.
|
|
||||||
|
|
||||||
============== ==============================
|
|
||||||
"member" Non-root member of a partition
|
"member" Non-root member of a partition
|
||||||
"root" Partition root
|
"root" Partition root
|
||||||
"root invalid" Invalid partition root
|
"isolated" Partition root without load balancing
|
||||||
============== ==============================
|
========== =====================================
|
||||||
|
|
||||||
It is a partition root if the first 2 partition root conditions
|
The root cgroup is always a partition root and its state
|
||||||
above are true and at least one CPU from "cpuset.cpus" is
|
cannot be changed. All other non-root cgroups start out as
|
||||||
granted by the parent cgroup.
|
"member".
|
||||||
|
|
||||||
A partition root can become invalid if none of CPUs requested
|
When set to "root", the current cgroup is the root of a new
|
||||||
in "cpuset.cpus" can be granted by the parent cgroup or the
|
partition or scheduling domain that comprises itself and all
|
||||||
parent cgroup is no longer a partition root itself. In this
|
its descendants except those that are separate partition roots
|
||||||
case, it is not a real partition even though the restriction
|
themselves and their descendants.
|
||||||
of the first partition root condition above will still apply.
|
|
||||||
The cpu affinity of all the tasks in the cgroup will then be
|
|
||||||
associated with CPUs in the nearest ancestor partition.
|
|
||||||
|
|
||||||
An invalid partition root can be transitioned back to a
|
When set to "isolated", the CPUs in that partition root will
|
||||||
real partition root if at least one of the requested CPUs
|
be in an isolated state without any load balancing from the
|
||||||
can now be granted by its parent. In this case, the cpu
|
scheduler. Tasks placed in such a partition with multiple
|
||||||
affinity of all the tasks in the formerly invalid partition
|
CPUs should be carefully distributed and bound to each of the
|
||||||
will be associated to the CPUs of the newly formed partition.
|
individual CPUs for optimal performance.
|
||||||
Changing the partition state of an invalid partition root to
|
|
||||||
"member" is always allowed even if child cpusets are present.
|
The value shown in "cpuset.cpus.effective" of a partition root
|
||||||
|
is the CPUs that the partition root can dedicate to a potential
|
||||||
|
new child partition root. The new child subtracts available
|
||||||
|
CPUs from its parent "cpuset.cpus.effective".
|
||||||
|
|
||||||
|
A partition root ("root" or "isolated") can be in one of the
|
||||||
|
two possible states - valid or invalid. An invalid partition
|
||||||
|
root is in a degraded state where some state information may
|
||||||
|
be retained, but behaves more like a "member".
|
||||||
|
|
||||||
|
All possible state transitions among "member", "root" and
|
||||||
|
"isolated" are allowed.
|
||||||
|
|
||||||
|
On read, the "cpuset.cpus.partition" file can show the following
|
||||||
|
values.
|
||||||
|
|
||||||
|
============================= =====================================
|
||||||
|
"member" Non-root member of a partition
|
||||||
|
"root" Partition root
|
||||||
|
"isolated" Partition root without load balancing
|
||||||
|
"root invalid (<reason>)" Invalid partition root
|
||||||
|
"isolated invalid (<reason>)" Invalid isolated partition root
|
||||||
|
============================= =====================================
|
||||||
|
|
||||||
|
In the case of an invalid partition root, a descriptive string on
|
||||||
|
why the partition is invalid is included within parentheses.
|
||||||
|
|
||||||
|
For a partition root to become valid, the following conditions
|
||||||
|
must be met.
|
||||||
|
|
||||||
|
1) The "cpuset.cpus" is exclusive with its siblings , i.e. they
|
||||||
|
are not shared by any of its siblings (exclusivity rule).
|
||||||
|
2) The parent cgroup is a valid partition root.
|
||||||
|
3) The "cpuset.cpus" is not empty and must contain at least
|
||||||
|
one of the CPUs from parent's "cpuset.cpus", i.e. they overlap.
|
||||||
|
4) The "cpuset.cpus.effective" cannot be empty unless there is
|
||||||
|
no task associated with this partition.
|
||||||
|
|
||||||
|
External events like hotplug or changes to "cpuset.cpus" can
|
||||||
|
cause a valid partition root to become invalid and vice versa.
|
||||||
|
Note that a task cannot be moved to a cgroup with empty
|
||||||
|
"cpuset.cpus.effective".
|
||||||
|
|
||||||
|
For a valid partition root with the sibling cpu exclusivity
|
||||||
|
rule enabled, changes made to "cpuset.cpus" that violate the
|
||||||
|
exclusivity rule will invalidate the partition as well as its
|
||||||
|
sibiling partitions with conflicting cpuset.cpus values. So
|
||||||
|
care must be taking in changing "cpuset.cpus".
|
||||||
|
|
||||||
|
A valid non-root parent partition may distribute out all its CPUs
|
||||||
|
to its child partitions when there is no task associated with it.
|
||||||
|
|
||||||
|
Care must be taken to change a valid partition root to
|
||||||
|
"member" as all its child partitions, if present, will become
|
||||||
|
invalid causing disruption to tasks running in those child
|
||||||
|
partitions. These inactivated partitions could be recovered if
|
||||||
|
their parent is switched back to a partition root with a proper
|
||||||
|
set of "cpuset.cpus".
|
||||||
|
|
||||||
|
Poll and inotify events are triggered whenever the state of
|
||||||
|
"cpuset.cpus.partition" changes. That includes changes caused
|
||||||
|
by write to "cpuset.cpus.partition", cpu hotplug or other
|
||||||
|
changes that modify the validity status of the partition.
|
||||||
|
This will allow user space agents to monitor unexpected changes
|
||||||
|
to "cpuset.cpus.partition" without the need to do continuous
|
||||||
|
polling.
|
||||||
|
|
||||||
|
|
||||||
Device controller
|
Device controller
|
||||||
|
|||||||
@@ -5,143 +5,115 @@ Dynamic debug
|
|||||||
Introduction
|
Introduction
|
||||||
============
|
============
|
||||||
|
|
||||||
This document describes how to use the dynamic debug (dyndbg) feature.
|
Dynamic debug allows you to dynamically enable/disable kernel
|
||||||
|
debug-print code to obtain additional kernel information.
|
||||||
|
|
||||||
Dynamic debug is designed to allow you to dynamically enable/disable
|
If ``/proc/dynamic_debug/control`` exists, your kernel has dynamic
|
||||||
kernel code to obtain additional kernel information. Currently, if
|
debug. You'll need root access (sudo su) to use this.
|
||||||
``CONFIG_DYNAMIC_DEBUG`` is set, then all ``pr_debug()``/``dev_dbg()`` and
|
|
||||||
``print_hex_dump_debug()``/``print_hex_dump_bytes()`` calls can be dynamically
|
|
||||||
enabled per-callsite.
|
|
||||||
|
|
||||||
If you do not want to enable dynamic debug globally (i.e. in some embedded
|
Dynamic debug provides:
|
||||||
system), you may set ``CONFIG_DYNAMIC_DEBUG_CORE`` as basic support of dynamic
|
|
||||||
debug and add ``ccflags := -DDYNAMIC_DEBUG_MODULE`` into the Makefile of any
|
|
||||||
modules which you'd like to dynamically debug later.
|
|
||||||
|
|
||||||
If ``CONFIG_DYNAMIC_DEBUG`` is not set, ``print_hex_dump_debug()`` is just
|
* a Catalog of all *prdbgs* in your kernel.
|
||||||
shortcut for ``print_hex_dump(KERN_DEBUG)``.
|
``cat /proc/dynamic_debug/control`` to see them.
|
||||||
|
|
||||||
For ``print_hex_dump_debug()``/``print_hex_dump_bytes()``, format string is
|
* a Simple query/command language to alter *prdbgs* by selecting on
|
||||||
its ``prefix_str`` argument, if it is constant string; or ``hexdump``
|
any combination of 0 or 1 of:
|
||||||
in case ``prefix_str`` is built dynamically.
|
|
||||||
|
|
||||||
Dynamic debug has even more useful features:
|
|
||||||
|
|
||||||
* Simple query language allows turning on and off debugging
|
|
||||||
statements by matching any combination of 0 or 1 of:
|
|
||||||
|
|
||||||
- source filename
|
- source filename
|
||||||
- function name
|
- function name
|
||||||
- line number (including ranges of line numbers)
|
- line number (including ranges of line numbers)
|
||||||
- module name
|
- module name
|
||||||
- format string
|
- format string
|
||||||
|
- class name (as known/declared by each module)
|
||||||
* Provides a debugfs control file: ``<debugfs>/dynamic_debug/control``
|
|
||||||
which can be read to display the complete list of known debug
|
|
||||||
statements, to help guide you
|
|
||||||
|
|
||||||
Controlling dynamic debug Behaviour
|
|
||||||
===================================
|
|
||||||
|
|
||||||
The behaviour of ``pr_debug()``/``dev_dbg()`` are controlled via writing to a
|
|
||||||
control file in the 'debugfs' filesystem. Thus, you must first mount
|
|
||||||
the debugfs filesystem, in order to make use of this feature.
|
|
||||||
Subsequently, we refer to the control file as:
|
|
||||||
``<debugfs>/dynamic_debug/control``. For example, if you want to enable
|
|
||||||
printing from source file ``svcsock.c``, line 1603 you simply do::
|
|
||||||
|
|
||||||
nullarbor:~ # echo 'file svcsock.c line 1603 +p' >
|
|
||||||
<debugfs>/dynamic_debug/control
|
|
||||||
|
|
||||||
If you make a mistake with the syntax, the write will fail thus::
|
|
||||||
|
|
||||||
nullarbor:~ # echo 'file svcsock.c wtf 1 +p' >
|
|
||||||
<debugfs>/dynamic_debug/control
|
|
||||||
-bash: echo: write error: Invalid argument
|
|
||||||
|
|
||||||
Note, for systems without 'debugfs' enabled, the control file can be
|
|
||||||
found in ``/proc/dynamic_debug/control``.
|
|
||||||
|
|
||||||
Viewing Dynamic Debug Behaviour
|
Viewing Dynamic Debug Behaviour
|
||||||
===============================
|
===============================
|
||||||
|
|
||||||
You can view the currently configured behaviour of all the debug
|
You can view the currently configured behaviour in the *prdbg* catalog::
|
||||||
statements via::
|
|
||||||
|
|
||||||
nullarbor:~ # cat <debugfs>/dynamic_debug/control
|
:#> head -n7 /proc/dynamic_debug/control
|
||||||
# filename:lineno [module]function flags format
|
# filename:lineno [module]function flags format
|
||||||
net/sunrpc/svc_rdma.c:323 [svcxprt_rdma]svc_rdma_cleanup =_ "SVCRDMA Module Removed, deregister RPC RDMA transport\012"
|
init/main.c:1179 [main]initcall_blacklist =_ "blacklisting initcall %s\012
|
||||||
net/sunrpc/svc_rdma.c:341 [svcxprt_rdma]svc_rdma_init =_ "\011max_inline : %d\012"
|
init/main.c:1218 [main]initcall_blacklisted =_ "initcall %s blacklisted\012"
|
||||||
net/sunrpc/svc_rdma.c:340 [svcxprt_rdma]svc_rdma_init =_ "\011sq_depth : %d\012"
|
init/main.c:1424 [main]run_init_process =_ " with arguments:\012"
|
||||||
net/sunrpc/svc_rdma.c:338 [svcxprt_rdma]svc_rdma_init =_ "\011max_requests : %d\012"
|
init/main.c:1426 [main]run_init_process =_ " %s\012"
|
||||||
...
|
init/main.c:1427 [main]run_init_process =_ " with environment:\012"
|
||||||
|
init/main.c:1429 [main]run_init_process =_ " %s\012"
|
||||||
|
|
||||||
|
The 3rd space-delimited column shows the current flags, preceded by
|
||||||
|
a ``=`` for easy use with grep/cut. ``=p`` shows enabled callsites.
|
||||||
|
|
||||||
You can also apply standard Unix text manipulation filters to this
|
Controlling dynamic debug Behaviour
|
||||||
data, e.g.::
|
===================================
|
||||||
|
|
||||||
nullarbor:~ # grep -i rdma <debugfs>/dynamic_debug/control | wc -l
|
The behaviour of *prdbg* sites are controlled by writing
|
||||||
62
|
query/commands to the control file. Example::
|
||||||
|
|
||||||
nullarbor:~ # grep -i tcp <debugfs>/dynamic_debug/control | wc -l
|
# grease the interface
|
||||||
42
|
:#> alias ddcmd='echo $* > /proc/dynamic_debug/control'
|
||||||
|
|
||||||
The third column shows the currently enabled flags for each debug
|
:#> ddcmd '-p; module main func run* +p'
|
||||||
statement callsite (see below for definitions of the flags). The
|
:#> grep =p /proc/dynamic_debug/control
|
||||||
default value, with no flags enabled, is ``=_``. So you can view all
|
init/main.c:1424 [main]run_init_process =p " with arguments:\012"
|
||||||
the debug statement callsites with any non-default flags::
|
init/main.c:1426 [main]run_init_process =p " %s\012"
|
||||||
|
init/main.c:1427 [main]run_init_process =p " with environment:\012"
|
||||||
|
init/main.c:1429 [main]run_init_process =p " %s\012"
|
||||||
|
|
||||||
nullarbor:~ # awk '$3 != "=_"' <debugfs>/dynamic_debug/control
|
Error messages go to console/syslog::
|
||||||
# filename:lineno [module]function flags format
|
|
||||||
net/sunrpc/svcsock.c:1603 [sunrpc]svc_send p "svc_process: st_sendto returned %d\012"
|
:#> ddcmd mode foo +p
|
||||||
|
dyndbg: unknown keyword "mode"
|
||||||
|
dyndbg: query parse failed
|
||||||
|
bash: echo: write error: Invalid argument
|
||||||
|
|
||||||
|
If debugfs is also enabled and mounted, ``dynamic_debug/control`` is
|
||||||
|
also under the mount-dir, typically ``/sys/kernel/debug/``.
|
||||||
|
|
||||||
Command Language Reference
|
Command Language Reference
|
||||||
==========================
|
==========================
|
||||||
|
|
||||||
At the lexical level, a command comprises a sequence of words separated
|
At the basic lexical level, a command is a sequence of words separated
|
||||||
by spaces or tabs. So these are all equivalent::
|
by spaces or tabs. So these are all equivalent::
|
||||||
|
|
||||||
nullarbor:~ # echo -n 'file svcsock.c line 1603 +p' >
|
:#> ddcmd file svcsock.c line 1603 +p
|
||||||
<debugfs>/dynamic_debug/control
|
:#> ddcmd "file svcsock.c line 1603 +p"
|
||||||
nullarbor:~ # echo -n ' file svcsock.c line 1603 +p ' >
|
:#> ddcmd ' file svcsock.c line 1603 +p '
|
||||||
<debugfs>/dynamic_debug/control
|
|
||||||
nullarbor:~ # echo -n 'file svcsock.c line 1603 +p' >
|
|
||||||
<debugfs>/dynamic_debug/control
|
|
||||||
|
|
||||||
Command submissions are bounded by a write() system call.
|
Command submissions are bounded by a write() system call.
|
||||||
Multiple commands can be written together, separated by ``;`` or ``\n``::
|
Multiple commands can be written together, separated by ``;`` or ``\n``::
|
||||||
|
|
||||||
~# echo "func pnpacpi_get_resources +p; func pnp_assign_mem +p" \
|
:#> ddcmd "func pnpacpi_get_resources +p; func pnp_assign_mem +p"
|
||||||
> <debugfs>/dynamic_debug/control
|
:#> ddcmd <<"EOC"
|
||||||
|
func pnpacpi_get_resources +p
|
||||||
|
func pnp_assign_mem +p
|
||||||
|
EOC
|
||||||
|
:#> cat query-batch-file > /proc/dynamic_debug/control
|
||||||
|
|
||||||
If your query set is big, you can batch them too::
|
You can also use wildcards in each query term. The match rule supports
|
||||||
|
``*`` (matches zero or more characters) and ``?`` (matches exactly one
|
||||||
|
character). For example, you can match all usb drivers::
|
||||||
|
|
||||||
~# cat query-batch-file > <debugfs>/dynamic_debug/control
|
:#> ddcmd file "drivers/usb/*" +p # "" to suppress shell expansion
|
||||||
|
|
||||||
Another way is to use wildcards. The match rule supports ``*`` (matches
|
Syntactically, a command is pairs of keyword values, followed by a
|
||||||
zero or more characters) and ``?`` (matches exactly one character). For
|
flags change or setting::
|
||||||
example, you can match all usb drivers::
|
|
||||||
|
|
||||||
~# echo "file drivers/usb/* +p" > <debugfs>/dynamic_debug/control
|
|
||||||
|
|
||||||
At the syntactical level, a command comprises a sequence of match
|
|
||||||
specifications, followed by a flags change specification::
|
|
||||||
|
|
||||||
command ::= match-spec* flags-spec
|
command ::= match-spec* flags-spec
|
||||||
|
|
||||||
The match-spec's are used to choose a subset of the known pr_debug()
|
The match-spec's select *prdbgs* from the catalog, upon which to apply
|
||||||
callsites to which to apply the flags-spec. Think of them as a query
|
the flags-spec, all constraints are ANDed together. An absent keyword
|
||||||
with implicit ANDs between each pair. Note that an empty list of
|
is the same as keyword "*".
|
||||||
match-specs will select all debug statement callsites.
|
|
||||||
|
|
||||||
A match specification comprises a keyword, which controls the
|
|
||||||
attribute of the callsite to be compared, and a value to compare
|
A match specification is a keyword, which selects the attribute of
|
||||||
against. Possible keywords are:::
|
the callsite to be compared, and a value to compare against. Possible
|
||||||
|
keywords are:::
|
||||||
|
|
||||||
match-spec ::= 'func' string |
|
match-spec ::= 'func' string |
|
||||||
'file' string |
|
'file' string |
|
||||||
'module' string |
|
'module' string |
|
||||||
'format' string |
|
'format' string |
|
||||||
|
'class' string |
|
||||||
'line' line-range
|
'line' line-range
|
||||||
|
|
||||||
line-range ::= lineno |
|
line-range ::= lineno |
|
||||||
@@ -203,6 +175,16 @@ format
|
|||||||
format "nfsd: SETATTR" // a neater way to match a format with whitespace
|
format "nfsd: SETATTR" // a neater way to match a format with whitespace
|
||||||
format 'nfsd: SETATTR' // yet another way to match a format with whitespace
|
format 'nfsd: SETATTR' // yet another way to match a format with whitespace
|
||||||
|
|
||||||
|
class
|
||||||
|
The given class_name is validated against each module, which may
|
||||||
|
have declared a list of known class_names. If the class_name is
|
||||||
|
found for a module, callsite & class matching and adjustment
|
||||||
|
proceeds. Examples::
|
||||||
|
|
||||||
|
class DRM_UT_KMS # a DRM.debug category
|
||||||
|
class JUNK # silent non-match
|
||||||
|
// class TLD_* # NOTICE: no wildcard in class names
|
||||||
|
|
||||||
line
|
line
|
||||||
The given line number or range of line numbers is compared
|
The given line number or range of line numbers is compared
|
||||||
against the line number of each ``pr_debug()`` callsite. A single
|
against the line number of each ``pr_debug()`` callsite. A single
|
||||||
@@ -228,17 +210,16 @@ of the characters::
|
|||||||
The flags are::
|
The flags are::
|
||||||
|
|
||||||
p enables the pr_debug() callsite.
|
p enables the pr_debug() callsite.
|
||||||
f Include the function name in the printed message
|
_ enables no flags.
|
||||||
l Include line number in the printed message
|
|
||||||
m Include module name in the printed message
|
|
||||||
t Include thread ID in messages not generated from interrupt context
|
|
||||||
_ No flags are set. (Or'd with others on input)
|
|
||||||
|
|
||||||
For ``print_hex_dump_debug()`` and ``print_hex_dump_bytes()``, only ``p`` flag
|
Decorator flags add to the message-prefix, in order:
|
||||||
have meaning, other flags ignored.
|
t Include thread ID, or <intr>
|
||||||
|
m Include module name
|
||||||
|
f Include the function name
|
||||||
|
l Include line number
|
||||||
|
|
||||||
For display, the flags are preceded by ``=``
|
For ``print_hex_dump_debug()`` and ``print_hex_dump_bytes()``, only
|
||||||
(mnemonic: what the flags are currently equal to).
|
the ``p`` flag has meaning, other flags are ignored.
|
||||||
|
|
||||||
Note the regexp ``^[-+=][flmpt_]+$`` matches a flags specification.
|
Note the regexp ``^[-+=][flmpt_]+$`` matches a flags specification.
|
||||||
To clear all flags at once, use ``=_`` or ``-flmpt``.
|
To clear all flags at once, use ``=_`` or ``-flmpt``.
|
||||||
@@ -313,7 +294,7 @@ For ``CONFIG_DYNAMIC_DEBUG`` kernels, any settings given at boot-time (or
|
|||||||
enabled by ``-DDEBUG`` flag during compilation) can be disabled later via
|
enabled by ``-DDEBUG`` flag during compilation) can be disabled later via
|
||||||
the debugfs interface if the debug messages are no longer needed::
|
the debugfs interface if the debug messages are no longer needed::
|
||||||
|
|
||||||
echo "module module_name -p" > <debugfs>/dynamic_debug/control
|
echo "module module_name -p" > /proc/dynamic_debug/control
|
||||||
|
|
||||||
Examples
|
Examples
|
||||||
========
|
========
|
||||||
@@ -321,37 +302,31 @@ Examples
|
|||||||
::
|
::
|
||||||
|
|
||||||
// enable the message at line 1603 of file svcsock.c
|
// enable the message at line 1603 of file svcsock.c
|
||||||
nullarbor:~ # echo -n 'file svcsock.c line 1603 +p' >
|
:#> ddcmd 'file svcsock.c line 1603 +p'
|
||||||
<debugfs>/dynamic_debug/control
|
|
||||||
|
|
||||||
// enable all the messages in file svcsock.c
|
// enable all the messages in file svcsock.c
|
||||||
nullarbor:~ # echo -n 'file svcsock.c +p' >
|
:#> ddcmd 'file svcsock.c +p'
|
||||||
<debugfs>/dynamic_debug/control
|
|
||||||
|
|
||||||
// enable all the messages in the NFS server module
|
// enable all the messages in the NFS server module
|
||||||
nullarbor:~ # echo -n 'module nfsd +p' >
|
:#> ddcmd 'module nfsd +p'
|
||||||
<debugfs>/dynamic_debug/control
|
|
||||||
|
|
||||||
// enable all 12 messages in the function svc_process()
|
// enable all 12 messages in the function svc_process()
|
||||||
nullarbor:~ # echo -n 'func svc_process +p' >
|
:#> ddcmd 'func svc_process +p'
|
||||||
<debugfs>/dynamic_debug/control
|
|
||||||
|
|
||||||
// disable all 12 messages in the function svc_process()
|
// disable all 12 messages in the function svc_process()
|
||||||
nullarbor:~ # echo -n 'func svc_process -p' >
|
:#> ddcmd 'func svc_process -p'
|
||||||
<debugfs>/dynamic_debug/control
|
|
||||||
|
|
||||||
// enable messages for NFS calls READ, READLINK, READDIR and READDIR+.
|
// enable messages for NFS calls READ, READLINK, READDIR and READDIR+.
|
||||||
nullarbor:~ # echo -n 'format "nfsd: READ" +p' >
|
:#> ddcmd 'format "nfsd: READ" +p'
|
||||||
<debugfs>/dynamic_debug/control
|
|
||||||
|
|
||||||
// enable messages in files of which the paths include string "usb"
|
// enable messages in files of which the paths include string "usb"
|
||||||
nullarbor:~ # echo -n 'file *usb* +p' > <debugfs>/dynamic_debug/control
|
:#> ddcmd 'file *usb* +p' > /proc/dynamic_debug/control
|
||||||
|
|
||||||
// enable all messages
|
// enable all messages
|
||||||
nullarbor:~ # echo -n '+p' > <debugfs>/dynamic_debug/control
|
:#> ddcmd '+p' > /proc/dynamic_debug/control
|
||||||
|
|
||||||
// add module, function to all enabled messages
|
// add module, function to all enabled messages
|
||||||
nullarbor:~ # echo -n '+mf' > <debugfs>/dynamic_debug/control
|
:#> ddcmd '+mf' > /proc/dynamic_debug/control
|
||||||
|
|
||||||
// boot-args example, with newlines and comments for readability
|
// boot-args example, with newlines and comments for readability
|
||||||
Kernel command line: ...
|
Kernel command line: ...
|
||||||
@@ -364,3 +339,38 @@ Examples
|
|||||||
dyndbg="file init/* +p #cmt ; func parse_one +p"
|
dyndbg="file init/* +p #cmt ; func parse_one +p"
|
||||||
// enable pr_debugs in 2 functions in a module loaded later
|
// enable pr_debugs in 2 functions in a module loaded later
|
||||||
pc87360.dyndbg="func pc87360_init_device +p; func pc87360_find +p"
|
pc87360.dyndbg="func pc87360_init_device +p; func pc87360_find +p"
|
||||||
|
|
||||||
|
Kernel Configuration
|
||||||
|
====================
|
||||||
|
|
||||||
|
Dynamic Debug is enabled via kernel config items::
|
||||||
|
|
||||||
|
CONFIG_DYNAMIC_DEBUG=y # build catalog, enables CORE
|
||||||
|
CONFIG_DYNAMIC_DEBUG_CORE=y # enable mechanics only, skip catalog
|
||||||
|
|
||||||
|
If you do not want to enable dynamic debug globally (i.e. in some embedded
|
||||||
|
system), you may set ``CONFIG_DYNAMIC_DEBUG_CORE`` as basic support of dynamic
|
||||||
|
debug and add ``ccflags := -DDYNAMIC_DEBUG_MODULE`` into the Makefile of any
|
||||||
|
modules which you'd like to dynamically debug later.
|
||||||
|
|
||||||
|
|
||||||
|
Kernel *prdbg* API
|
||||||
|
==================
|
||||||
|
|
||||||
|
The following functions are cataloged and controllable when dynamic
|
||||||
|
debug is enabled::
|
||||||
|
|
||||||
|
pr_debug()
|
||||||
|
dev_dbg()
|
||||||
|
print_hex_dump_debug()
|
||||||
|
print_hex_dump_bytes()
|
||||||
|
|
||||||
|
Otherwise, they are off by default; ``ccflags += -DDEBUG`` or
|
||||||
|
``#define DEBUG`` in a source file will enable them appropriately.
|
||||||
|
|
||||||
|
If ``CONFIG_DYNAMIC_DEBUG`` is not set, ``print_hex_dump_debug()`` is
|
||||||
|
just a shortcut for ``print_hex_dump(KERN_DEBUG)``.
|
||||||
|
|
||||||
|
For ``print_hex_dump_debug()``/``print_hex_dump_bytes()``, format string is
|
||||||
|
its ``prefix_str`` argument, if it is constant string; or ``hexdump``
|
||||||
|
in case ``prefix_str`` is built dynamically.
|
||||||
|
|||||||
@@ -230,6 +230,20 @@ The possible values in this file are:
|
|||||||
* - 'Mitigation: Clear CPU buffers'
|
* - 'Mitigation: Clear CPU buffers'
|
||||||
- The processor is vulnerable and the CPU buffer clearing mitigation is
|
- The processor is vulnerable and the CPU buffer clearing mitigation is
|
||||||
enabled.
|
enabled.
|
||||||
|
* - 'Unknown: No mitigations'
|
||||||
|
- The processor vulnerability status is unknown because it is
|
||||||
|
out of Servicing period. Mitigation is not attempted.
|
||||||
|
|
||||||
|
Definitions:
|
||||||
|
------------
|
||||||
|
|
||||||
|
Servicing period: The process of providing functional and security updates to
|
||||||
|
Intel processors or platforms, utilizing the Intel Platform Update (IPU)
|
||||||
|
process or other similar mechanisms.
|
||||||
|
|
||||||
|
End of Servicing Updates (ESU): ESU is the date at which Intel will no
|
||||||
|
longer provide Servicing, such as through IPU or other similar update
|
||||||
|
processes. ESU dates will typically be aligned to end of quarter.
|
||||||
|
|
||||||
If the processor is vulnerable then the following information is appended to
|
If the processor is vulnerable then the following information is appended to
|
||||||
the above information:
|
the above information:
|
||||||
|
|||||||
@@ -613,6 +613,7 @@ kernel command line.
|
|||||||
eibrs enhanced IBRS
|
eibrs enhanced IBRS
|
||||||
eibrs,retpoline enhanced IBRS + Retpolines
|
eibrs,retpoline enhanced IBRS + Retpolines
|
||||||
eibrs,lfence enhanced IBRS + LFENCE
|
eibrs,lfence enhanced IBRS + LFENCE
|
||||||
|
ibrs use IBRS to protect kernel
|
||||||
|
|
||||||
Not specifying this option is equivalent to
|
Not specifying this option is equivalent to
|
||||||
spectre_v2=auto.
|
spectre_v2=auto.
|
||||||
|
|||||||
@@ -200,7 +200,7 @@ prb
|
|||||||
|
|
||||||
A pointer to the printk ringbuffer (struct printk_ringbuffer). This
|
A pointer to the printk ringbuffer (struct printk_ringbuffer). This
|
||||||
may be pointing to the static boot ringbuffer or the dynamically
|
may be pointing to the static boot ringbuffer or the dynamically
|
||||||
allocated ringbuffer, depending on when the the core dump occurred.
|
allocated ringbuffer, depending on when the core dump occurred.
|
||||||
Used by user-space tools to read the active kernel log buffer.
|
Used by user-space tools to read the active kernel log buffer.
|
||||||
|
|
||||||
printk_rb_static
|
printk_rb_static
|
||||||
|
|||||||
@@ -321,6 +321,8 @@
|
|||||||
force_enable - Force enable the IOMMU on platforms known
|
force_enable - Force enable the IOMMU on platforms known
|
||||||
to be buggy with IOMMU enabled. Use this
|
to be buggy with IOMMU enabled. Use this
|
||||||
option with care.
|
option with care.
|
||||||
|
pgtbl_v1 - Use v1 page table for DMA-API (Default).
|
||||||
|
pgtbl_v2 - Use v2 page table for DMA-API.
|
||||||
|
|
||||||
amd_iommu_dump= [HW,X86-64]
|
amd_iommu_dump= [HW,X86-64]
|
||||||
Enable AMD IOMMU driver option to dump the ACPI table
|
Enable AMD IOMMU driver option to dump the ACPI table
|
||||||
@@ -966,10 +968,6 @@
|
|||||||
|
|
||||||
debugpat [X86] Enable PAT debugging
|
debugpat [X86] Enable PAT debugging
|
||||||
|
|
||||||
decnet.addr= [HW,NET]
|
|
||||||
Format: <area>[,<node>]
|
|
||||||
See also Documentation/networking/decnet.rst.
|
|
||||||
|
|
||||||
default_hugepagesz=
|
default_hugepagesz=
|
||||||
[HW] The size of the default HugeTLB page. This is
|
[HW] The size of the default HugeTLB page. This is
|
||||||
the size represented by the legacy /proc/ hugepages
|
the size represented by the legacy /proc/ hugepages
|
||||||
@@ -1471,6 +1469,14 @@
|
|||||||
Permit 'security.evm' to be updated regardless of
|
Permit 'security.evm' to be updated regardless of
|
||||||
current integrity status.
|
current integrity status.
|
||||||
|
|
||||||
|
early_page_ext [KNL] Enforces page_ext initialization to earlier
|
||||||
|
stages so cover more early boot allocations.
|
||||||
|
Please note that as side effect some optimizations
|
||||||
|
might be disabled to achieve that (e.g. parallelized
|
||||||
|
memory initialization is disabled) so the boot process
|
||||||
|
might take longer, especially on systems with a lot of
|
||||||
|
memory. Available with CONFIG_PAGE_EXTENSION=y.
|
||||||
|
|
||||||
failslab=
|
failslab=
|
||||||
fail_usercopy=
|
fail_usercopy=
|
||||||
fail_page_alloc=
|
fail_page_alloc=
|
||||||
@@ -2436,6 +2442,12 @@
|
|||||||
0: force disabled
|
0: force disabled
|
||||||
1: force enabled
|
1: force enabled
|
||||||
|
|
||||||
|
kunit.enable= [KUNIT] Enable executing KUnit tests. Requires
|
||||||
|
CONFIG_KUNIT to be set to be fully enabled. The
|
||||||
|
default value can be overridden via
|
||||||
|
KUNIT_DEFAULT_ENABLED.
|
||||||
|
Default is 1 (enabled)
|
||||||
|
|
||||||
kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
|
kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
|
||||||
Default is 0 (don't ignore, but inject #GP)
|
Default is 0 (don't ignore, but inject #GP)
|
||||||
|
|
||||||
@@ -3207,6 +3219,7 @@
|
|||||||
spectre_v2_user=off [X86]
|
spectre_v2_user=off [X86]
|
||||||
spec_store_bypass_disable=off [X86,PPC]
|
spec_store_bypass_disable=off [X86,PPC]
|
||||||
ssbd=force-off [ARM64]
|
ssbd=force-off [ARM64]
|
||||||
|
nospectre_bhb [ARM64]
|
||||||
l1tf=off [X86]
|
l1tf=off [X86]
|
||||||
mds=off [X86]
|
mds=off [X86]
|
||||||
tsx_async_abort=off [X86]
|
tsx_async_abort=off [X86]
|
||||||
@@ -3613,7 +3626,7 @@
|
|||||||
|
|
||||||
nohugeiomap [KNL,X86,PPC,ARM64] Disable kernel huge I/O mappings.
|
nohugeiomap [KNL,X86,PPC,ARM64] Disable kernel huge I/O mappings.
|
||||||
|
|
||||||
nohugevmalloc [PPC] Disable kernel huge vmalloc mappings.
|
nohugevmalloc [KNL,X86,PPC,ARM64] Disable kernel huge vmalloc mappings.
|
||||||
|
|
||||||
nosmt [KNL,S390] Disable symmetric multithreading (SMT).
|
nosmt [KNL,S390] Disable symmetric multithreading (SMT).
|
||||||
Equivalent to smt=1.
|
Equivalent to smt=1.
|
||||||
@@ -3626,11 +3639,15 @@
|
|||||||
(bounds check bypass). With this option data leaks are
|
(bounds check bypass). With this option data leaks are
|
||||||
possible in the system.
|
possible in the system.
|
||||||
|
|
||||||
nospectre_v2 [X86,PPC_FSL_BOOK3E,ARM64] Disable all mitigations for
|
nospectre_v2 [X86,PPC_E500,ARM64] Disable all mitigations for
|
||||||
the Spectre variant 2 (indirect branch prediction)
|
the Spectre variant 2 (indirect branch prediction)
|
||||||
vulnerability. System may allow data leaks with this
|
vulnerability. System may allow data leaks with this
|
||||||
option.
|
option.
|
||||||
|
|
||||||
|
nospectre_bhb [ARM64] Disable all mitigations for Spectre-BHB (branch
|
||||||
|
history injection) vulnerability. System may allow data leaks
|
||||||
|
with this option.
|
||||||
|
|
||||||
nospec_store_bypass_disable
|
nospec_store_bypass_disable
|
||||||
[HW] Disable all mitigations for the Speculative Store Bypass vulnerability
|
[HW] Disable all mitigations for the Speculative Store Bypass vulnerability
|
||||||
|
|
||||||
@@ -3741,9 +3758,9 @@
|
|||||||
[X86,PV_OPS] Disable paravirtualized VMware scheduler
|
[X86,PV_OPS] Disable paravirtualized VMware scheduler
|
||||||
clock and use the default one.
|
clock and use the default one.
|
||||||
|
|
||||||
no-steal-acc [X86,PV_OPS,ARM64] Disable paravirtualized steal time
|
no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES] Disable paravirtualized
|
||||||
accounting. steal time is computed, but won't
|
steal time accounting. steal time is computed, but
|
||||||
influence scheduler behaviour
|
won't influence scheduler behaviour
|
||||||
|
|
||||||
nolapic [X86-32,APIC] Do not enable or use the local APIC.
|
nolapic [X86-32,APIC] Do not enable or use the local APIC.
|
||||||
|
|
||||||
@@ -3805,6 +3822,10 @@
|
|||||||
|
|
||||||
nox2apic [X86-64,APIC] Do not enable x2APIC mode.
|
nox2apic [X86-64,APIC] Do not enable x2APIC mode.
|
||||||
|
|
||||||
|
NOTE: this parameter will be ignored on systems with the
|
||||||
|
LEGACY_XAPIC_DISABLED bit set in the
|
||||||
|
IA32_XAPIC_DISABLE_STATUS MSR.
|
||||||
|
|
||||||
nps_mtm_hs_ctr= [KNL,ARC]
|
nps_mtm_hs_ctr= [KNL,ARC]
|
||||||
This parameter sets the maximum duration, in
|
This parameter sets the maximum duration, in
|
||||||
cycles, each HW thread of the CTOP can run
|
cycles, each HW thread of the CTOP can run
|
||||||
@@ -5331,6 +5352,8 @@
|
|||||||
rodata= [KNL]
|
rodata= [KNL]
|
||||||
on Mark read-only kernel memory as read-only (default).
|
on Mark read-only kernel memory as read-only (default).
|
||||||
off Leave read-only kernel memory writable for debugging.
|
off Leave read-only kernel memory writable for debugging.
|
||||||
|
full Mark read-only kernel memory and aliases as read-only
|
||||||
|
[arm64]
|
||||||
|
|
||||||
rockchip.usb_uart
|
rockchip.usb_uart
|
||||||
Enable the uart passthrough on the designated usb port
|
Enable the uart passthrough on the designated usb port
|
||||||
@@ -6026,12 +6049,6 @@
|
|||||||
This parameter controls use of the Protected
|
This parameter controls use of the Protected
|
||||||
Execution Facility on pSeries.
|
Execution Facility on pSeries.
|
||||||
|
|
||||||
swapaccount= [KNL]
|
|
||||||
Format: [0|1]
|
|
||||||
Enable accounting of swap in memory resource
|
|
||||||
controller if no parameter or 1 is given or disable
|
|
||||||
it if 0 is given (See Documentation/admin-guide/cgroup-v1/memory.rst)
|
|
||||||
|
|
||||||
swiotlb= [ARM,IA-64,PPC,MIPS,X86]
|
swiotlb= [ARM,IA-64,PPC,MIPS,X86]
|
||||||
Format: { <int> [,<int>] | force | noforce }
|
Format: { <int> [,<int>] | force | noforce }
|
||||||
<int> -- Number of I/O TLB slabs
|
<int> -- Number of I/O TLB slabs
|
||||||
@@ -6834,6 +6851,12 @@
|
|||||||
Crash from Xen panic notifier, without executing late
|
Crash from Xen panic notifier, without executing late
|
||||||
panic() code such as dumping handler.
|
panic() code such as dumping handler.
|
||||||
|
|
||||||
|
xen_msr_safe= [X86,XEN]
|
||||||
|
Format: <bool>
|
||||||
|
Select whether to always use non-faulting (safe) MSR
|
||||||
|
access functions when running as Xen PV guest. The
|
||||||
|
default value is controlled by CONFIG_XEN_PV_MSR_SAFE.
|
||||||
|
|
||||||
xen_nopvspin [X86,XEN]
|
xen_nopvspin [X86,XEN]
|
||||||
Disables the qspinlock slowpath using Xen PV optimizations.
|
Disables the qspinlock slowpath using Xen PV optimizations.
|
||||||
This parameter is obsoleted by "nopvspin" parameter, which
|
This parameter is obsoleted by "nopvspin" parameter, which
|
||||||
|
|||||||
@@ -5,10 +5,10 @@ CMA Debugfs Interface
|
|||||||
The CMA debugfs interface is useful to retrieve basic information out of the
|
The CMA debugfs interface is useful to retrieve basic information out of the
|
||||||
different CMA areas and to test allocation/release in each of the areas.
|
different CMA areas and to test allocation/release in each of the areas.
|
||||||
|
|
||||||
Each CMA zone represents a directory under <debugfs>/cma/, indexed by the
|
Each CMA area represents a directory under <debugfs>/cma/, represented by
|
||||||
kernel's CMA index. So the first CMA zone would be:
|
its CMA name like below:
|
||||||
|
|
||||||
<debugfs>/cma/cma-0
|
<debugfs>/cma/<cma_name>
|
||||||
|
|
||||||
The structure of the files created under that directory is as follows:
|
The structure of the files created under that directory is as follows:
|
||||||
|
|
||||||
@@ -18,8 +18,8 @@ The structure of the files created under that directory is as follows:
|
|||||||
- [RO] bitmap: The bitmap of page states in the zone.
|
- [RO] bitmap: The bitmap of page states in the zone.
|
||||||
- [WO] alloc: Allocate N pages from that CMA area. For example::
|
- [WO] alloc: Allocate N pages from that CMA area. For example::
|
||||||
|
|
||||||
echo 5 > <debugfs>/cma/cma-2/alloc
|
echo 5 > <debugfs>/cma/<cma_name>/alloc
|
||||||
|
|
||||||
would try to allocate 5 pages from the cma-2 area.
|
would try to allocate 5 pages from the 'cma_name' area.
|
||||||
|
|
||||||
- [WO] free: Free N pages from that CMA area, similar to the above.
|
- [WO] free: Free N pages from that CMA area, similar to the above.
|
||||||
|
|||||||
@@ -1,8 +1,8 @@
|
|||||||
.. SPDX-License-Identifier: GPL-2.0
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
========================
|
==========================
|
||||||
Monitoring Data Accesses
|
DAMON: Data Access MONitor
|
||||||
========================
|
==========================
|
||||||
|
|
||||||
:doc:`DAMON </mm/damon/index>` allows light-weight data access monitoring.
|
:doc:`DAMON </mm/damon/index>` allows light-weight data access monitoring.
|
||||||
Using DAMON, users can analyze the memory access patterns of their systems and
|
Using DAMON, users can analyze the memory access patterns of their systems and
|
||||||
|
|||||||
@@ -29,16 +29,9 @@ called DAMON Operator (DAMO). It is available at
|
|||||||
https://github.com/awslabs/damo. The examples below assume that ``damo`` is on
|
https://github.com/awslabs/damo. The examples below assume that ``damo`` is on
|
||||||
your ``$PATH``. It's not mandatory, though.
|
your ``$PATH``. It's not mandatory, though.
|
||||||
|
|
||||||
Because DAMO is using the debugfs interface (refer to :doc:`usage` for the
|
Because DAMO is using the sysfs interface (refer to :doc:`usage` for the
|
||||||
detail) of DAMON, you should ensure debugfs is mounted. Mount it manually as
|
detail) of DAMON, you should ensure :doc:`sysfs </filesystems/sysfs>` is
|
||||||
below::
|
mounted.
|
||||||
|
|
||||||
# mount -t debugfs none /sys/kernel/debug/
|
|
||||||
|
|
||||||
or append the following line to your ``/etc/fstab`` file so that your system
|
|
||||||
can automatically mount debugfs upon booting::
|
|
||||||
|
|
||||||
debugfs /sys/kernel/debug debugfs defaults 0 0
|
|
||||||
|
|
||||||
|
|
||||||
Recording Data Access Patterns
|
Recording Data Access Patterns
|
||||||
|
|||||||
@@ -50,10 +50,10 @@ For a short example, users can monitor the virtual address space of a given
|
|||||||
workload as below. ::
|
workload as below. ::
|
||||||
|
|
||||||
# cd /sys/kernel/mm/damon/admin/
|
# cd /sys/kernel/mm/damon/admin/
|
||||||
# echo 1 > kdamonds/nr && echo 1 > kdamonds/0/contexts/nr
|
# echo 1 > kdamonds/nr_kdamonds && echo 1 > kdamonds/0/contexts/nr_contexts
|
||||||
# echo vaddr > kdamonds/0/contexts/0/operations
|
# echo vaddr > kdamonds/0/contexts/0/operations
|
||||||
# echo 1 > kdamonds/0/contexts/0/targets/nr
|
# echo 1 > kdamonds/0/contexts/0/targets/nr_targets
|
||||||
# echo $(pidof <workload>) > kdamonds/0/contexts/0/targets/0/pid
|
# echo $(pidof <workload>) > kdamonds/0/contexts/0/targets/0/pid_target
|
||||||
# echo on > kdamonds/0/state
|
# echo on > kdamonds/0/state
|
||||||
|
|
||||||
Files Hierarchy
|
Files Hierarchy
|
||||||
@@ -366,12 +366,12 @@ memory rate becomes larger than 60%, or lower than 30%". ::
|
|||||||
# echo 1 > kdamonds/0/contexts/0/schemes/nr_schemes
|
# echo 1 > kdamonds/0/contexts/0/schemes/nr_schemes
|
||||||
# cd kdamonds/0/contexts/0/schemes/0
|
# cd kdamonds/0/contexts/0/schemes/0
|
||||||
# # set the basic access pattern and the action
|
# # set the basic access pattern and the action
|
||||||
# echo 4096 > access_patterns/sz/min
|
# echo 4096 > access_pattern/sz/min
|
||||||
# echo 8192 > access_patterns/sz/max
|
# echo 8192 > access_pattern/sz/max
|
||||||
# echo 0 > access_patterns/nr_accesses/min
|
# echo 0 > access_pattern/nr_accesses/min
|
||||||
# echo 5 > access_patterns/nr_accesses/max
|
# echo 5 > access_pattern/nr_accesses/max
|
||||||
# echo 10 > access_patterns/age/min
|
# echo 10 > access_pattern/age/min
|
||||||
# echo 20 > access_patterns/age/max
|
# echo 20 > access_pattern/age/max
|
||||||
# echo pageout > action
|
# echo pageout > action
|
||||||
# # set quotas
|
# # set quotas
|
||||||
# echo 10 > quotas/ms
|
# echo 10 > quotas/ms
|
||||||
@@ -393,6 +393,11 @@ the files as above. Above is only for an example.
|
|||||||
debugfs Interface
|
debugfs Interface
|
||||||
=================
|
=================
|
||||||
|
|
||||||
|
.. note::
|
||||||
|
|
||||||
|
DAMON debugfs interface will be removed after next LTS kernel is released, so
|
||||||
|
users should move to the :ref:`sysfs interface <sysfs_interface>`.
|
||||||
|
|
||||||
DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``,
|
DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``,
|
||||||
``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and
|
``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and
|
||||||
``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
|
``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
|
||||||
|
|||||||
@@ -65,7 +65,7 @@ HugePages_Surp
|
|||||||
may be temporarily larger than the maximum number of surplus huge
|
may be temporarily larger than the maximum number of surplus huge
|
||||||
pages when the system is under memory pressure.
|
pages when the system is under memory pressure.
|
||||||
Hugepagesize
|
Hugepagesize
|
||||||
is the default hugepage size (in Kb).
|
is the default hugepage size (in kB).
|
||||||
Hugetlb
|
Hugetlb
|
||||||
is the total amount of memory (in kB), consumed by huge
|
is the total amount of memory (in kB), consumed by huge
|
||||||
pages of all sizes.
|
pages of all sizes.
|
||||||
|
|||||||
@@ -32,6 +32,7 @@ the Linux memory management.
|
|||||||
idle_page_tracking
|
idle_page_tracking
|
||||||
ksm
|
ksm
|
||||||
memory-hotplug
|
memory-hotplug
|
||||||
|
multigen_lru
|
||||||
nommu-mmap
|
nommu-mmap
|
||||||
numa_memory_policy
|
numa_memory_policy
|
||||||
numaperf
|
numaperf
|
||||||
|
|||||||
@@ -184,6 +184,42 @@ The maximum possible ``pages_sharing/pages_shared`` ratio is limited by the
|
|||||||
``max_page_sharing`` tunable. To increase the ratio ``max_page_sharing`` must
|
``max_page_sharing`` tunable. To increase the ratio ``max_page_sharing`` must
|
||||||
be increased accordingly.
|
be increased accordingly.
|
||||||
|
|
||||||
|
Monitoring KSM profit
|
||||||
|
=====================
|
||||||
|
|
||||||
|
KSM can save memory by merging identical pages, but also can consume
|
||||||
|
additional memory, because it needs to generate a number of rmap_items to
|
||||||
|
save each scanned page's brief rmap information. Some of these pages may
|
||||||
|
be merged, but some may not be abled to be merged after being checked
|
||||||
|
several times, which are unprofitable memory consumed.
|
||||||
|
|
||||||
|
1) How to determine whether KSM save memory or consume memory in system-wide
|
||||||
|
range? Here is a simple approximate calculation for reference::
|
||||||
|
|
||||||
|
general_profit =~ pages_sharing * sizeof(page) - (all_rmap_items) *
|
||||||
|
sizeof(rmap_item);
|
||||||
|
|
||||||
|
where all_rmap_items can be easily obtained by summing ``pages_sharing``,
|
||||||
|
``pages_shared``, ``pages_unshared`` and ``pages_volatile``.
|
||||||
|
|
||||||
|
2) The KSM profit inner a single process can be similarly obtained by the
|
||||||
|
following approximate calculation::
|
||||||
|
|
||||||
|
process_profit =~ ksm_merging_pages * sizeof(page) -
|
||||||
|
ksm_rmap_items * sizeof(rmap_item).
|
||||||
|
|
||||||
|
where ksm_merging_pages is shown under the directory ``/proc/<pid>/``,
|
||||||
|
and ksm_rmap_items is shown in ``/proc/<pid>/ksm_stat``.
|
||||||
|
|
||||||
|
From the perspective of application, a high ratio of ``ksm_rmap_items`` to
|
||||||
|
``ksm_merging_pages`` means a bad madvise-applied policy, so developers or
|
||||||
|
administrators have to rethink how to change madvise policy. Giving an example
|
||||||
|
for reference, a page's size is usually 4K, and the rmap_item's size is
|
||||||
|
separately 32B on 32-bit CPU architecture and 64B on 64-bit CPU architecture.
|
||||||
|
so if the ``ksm_rmap_items/ksm_merging_pages`` ratio exceeds 64 on 64-bit CPU
|
||||||
|
or exceeds 128 on 32-bit CPU, then the app's madvise policy should be dropped,
|
||||||
|
because the ksm profit is approximately zero or negative.
|
||||||
|
|
||||||
Monitoring KSM events
|
Monitoring KSM events
|
||||||
=====================
|
=====================
|
||||||
|
|
||||||
|
|||||||
162
Documentation/admin-guide/mm/multigen_lru.rst
Normal file
162
Documentation/admin-guide/mm/multigen_lru.rst
Normal file
@@ -0,0 +1,162 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
=============
|
||||||
|
Multi-Gen LRU
|
||||||
|
=============
|
||||||
|
The multi-gen LRU is an alternative LRU implementation that optimizes
|
||||||
|
page reclaim and improves performance under memory pressure. Page
|
||||||
|
reclaim decides the kernel's caching policy and ability to overcommit
|
||||||
|
memory. It directly impacts the kswapd CPU usage and RAM efficiency.
|
||||||
|
|
||||||
|
Quick start
|
||||||
|
===========
|
||||||
|
Build the kernel with the following configurations.
|
||||||
|
|
||||||
|
* ``CONFIG_LRU_GEN=y``
|
||||||
|
* ``CONFIG_LRU_GEN_ENABLED=y``
|
||||||
|
|
||||||
|
All set!
|
||||||
|
|
||||||
|
Runtime options
|
||||||
|
===============
|
||||||
|
``/sys/kernel/mm/lru_gen/`` contains stable ABIs described in the
|
||||||
|
following subsections.
|
||||||
|
|
||||||
|
Kill switch
|
||||||
|
-----------
|
||||||
|
``enabled`` accepts different values to enable or disable the
|
||||||
|
following components. Its default value depends on
|
||||||
|
``CONFIG_LRU_GEN_ENABLED``. All the components should be enabled
|
||||||
|
unless some of them have unforeseen side effects. Writing to
|
||||||
|
``enabled`` has no effect when a component is not supported by the
|
||||||
|
hardware, and valid values will be accepted even when the main switch
|
||||||
|
is off.
|
||||||
|
|
||||||
|
====== ===============================================================
|
||||||
|
Values Components
|
||||||
|
====== ===============================================================
|
||||||
|
0x0001 The main switch for the multi-gen LRU.
|
||||||
|
0x0002 Clearing the accessed bit in leaf page table entries in large
|
||||||
|
batches, when MMU sets it (e.g., on x86). This behavior can
|
||||||
|
theoretically worsen lock contention (mmap_lock). If it is
|
||||||
|
disabled, the multi-gen LRU will suffer a minor performance
|
||||||
|
degradation for workloads that contiguously map hot pages,
|
||||||
|
whose accessed bits can be otherwise cleared by fewer larger
|
||||||
|
batches.
|
||||||
|
0x0004 Clearing the accessed bit in non-leaf page table entries as
|
||||||
|
well, when MMU sets it (e.g., on x86). This behavior was not
|
||||||
|
verified on x86 varieties other than Intel and AMD. If it is
|
||||||
|
disabled, the multi-gen LRU will suffer a negligible
|
||||||
|
performance degradation.
|
||||||
|
[yYnN] Apply to all the components above.
|
||||||
|
====== ===============================================================
|
||||||
|
|
||||||
|
E.g.,
|
||||||
|
::
|
||||||
|
|
||||||
|
echo y >/sys/kernel/mm/lru_gen/enabled
|
||||||
|
cat /sys/kernel/mm/lru_gen/enabled
|
||||||
|
0x0007
|
||||||
|
echo 5 >/sys/kernel/mm/lru_gen/enabled
|
||||||
|
cat /sys/kernel/mm/lru_gen/enabled
|
||||||
|
0x0005
|
||||||
|
|
||||||
|
Thrashing prevention
|
||||||
|
--------------------
|
||||||
|
Personal computers are more sensitive to thrashing because it can
|
||||||
|
cause janks (lags when rendering UI) and negatively impact user
|
||||||
|
experience. The multi-gen LRU offers thrashing prevention to the
|
||||||
|
majority of laptop and desktop users who do not have ``oomd``.
|
||||||
|
|
||||||
|
Users can write ``N`` to ``min_ttl_ms`` to prevent the working set of
|
||||||
|
``N`` milliseconds from getting evicted. The OOM killer is triggered
|
||||||
|
if this working set cannot be kept in memory. In other words, this
|
||||||
|
option works as an adjustable pressure relief valve, and when open, it
|
||||||
|
terminates applications that are hopefully not being used.
|
||||||
|
|
||||||
|
Based on the average human detectable lag (~100ms), ``N=1000`` usually
|
||||||
|
eliminates intolerable janks due to thrashing. Larger values like
|
||||||
|
``N=3000`` make janks less noticeable at the risk of premature OOM
|
||||||
|
kills.
|
||||||
|
|
||||||
|
The default value ``0`` means disabled.
|
||||||
|
|
||||||
|
Experimental features
|
||||||
|
=====================
|
||||||
|
``/sys/kernel/debug/lru_gen`` accepts commands described in the
|
||||||
|
following subsections. Multiple command lines are supported, so does
|
||||||
|
concatenation with delimiters ``,`` and ``;``.
|
||||||
|
|
||||||
|
``/sys/kernel/debug/lru_gen_full`` provides additional stats for
|
||||||
|
debugging. ``CONFIG_LRU_GEN_STATS=y`` keeps historical stats from
|
||||||
|
evicted generations in this file.
|
||||||
|
|
||||||
|
Working set estimation
|
||||||
|
----------------------
|
||||||
|
Working set estimation measures how much memory an application needs
|
||||||
|
in a given time interval, and it is usually done with little impact on
|
||||||
|
the performance of the application. E.g., data centers want to
|
||||||
|
optimize job scheduling (bin packing) to improve memory utilizations.
|
||||||
|
When a new job comes in, the job scheduler needs to find out whether
|
||||||
|
each server it manages can allocate a certain amount of memory for
|
||||||
|
this new job before it can pick a candidate. To do so, the job
|
||||||
|
scheduler needs to estimate the working sets of the existing jobs.
|
||||||
|
|
||||||
|
When it is read, ``lru_gen`` returns a histogram of numbers of pages
|
||||||
|
accessed over different time intervals for each memcg and node.
|
||||||
|
``MAX_NR_GENS`` decides the number of bins for each histogram. The
|
||||||
|
histograms are noncumulative.
|
||||||
|
::
|
||||||
|
|
||||||
|
memcg memcg_id memcg_path
|
||||||
|
node node_id
|
||||||
|
min_gen_nr age_in_ms nr_anon_pages nr_file_pages
|
||||||
|
...
|
||||||
|
max_gen_nr age_in_ms nr_anon_pages nr_file_pages
|
||||||
|
|
||||||
|
Each bin contains an estimated number of pages that have been accessed
|
||||||
|
within ``age_in_ms``. E.g., ``min_gen_nr`` contains the coldest pages
|
||||||
|
and ``max_gen_nr`` contains the hottest pages, since ``age_in_ms`` of
|
||||||
|
the former is the largest and that of the latter is the smallest.
|
||||||
|
|
||||||
|
Users can write the following command to ``lru_gen`` to create a new
|
||||||
|
generation ``max_gen_nr+1``:
|
||||||
|
|
||||||
|
``+ memcg_id node_id max_gen_nr [can_swap [force_scan]]``
|
||||||
|
|
||||||
|
``can_swap`` defaults to the swap setting and, if it is set to ``1``,
|
||||||
|
it forces the scan of anon pages when swap is off, and vice versa.
|
||||||
|
``force_scan`` defaults to ``1`` and, if it is set to ``0``, it
|
||||||
|
employs heuristics to reduce the overhead, which is likely to reduce
|
||||||
|
the coverage as well.
|
||||||
|
|
||||||
|
A typical use case is that a job scheduler runs this command at a
|
||||||
|
certain time interval to create new generations, and it ranks the
|
||||||
|
servers it manages based on the sizes of their cold pages defined by
|
||||||
|
this time interval.
|
||||||
|
|
||||||
|
Proactive reclaim
|
||||||
|
-----------------
|
||||||
|
Proactive reclaim induces page reclaim when there is no memory
|
||||||
|
pressure. It usually targets cold pages only. E.g., when a new job
|
||||||
|
comes in, the job scheduler wants to proactively reclaim cold pages on
|
||||||
|
the server it selected, to improve the chance of successfully landing
|
||||||
|
this new job.
|
||||||
|
|
||||||
|
Users can write the following command to ``lru_gen`` to evict
|
||||||
|
generations less than or equal to ``min_gen_nr``.
|
||||||
|
|
||||||
|
``- memcg_id node_id min_gen_nr [swappiness [nr_to_reclaim]]``
|
||||||
|
|
||||||
|
``min_gen_nr`` should be less than ``max_gen_nr-1``, since
|
||||||
|
``max_gen_nr`` and ``max_gen_nr-1`` are not fully aged (equivalent to
|
||||||
|
the active list) and therefore cannot be evicted. ``swappiness``
|
||||||
|
overrides the default value in ``/proc/sys/vm/swappiness``.
|
||||||
|
``nr_to_reclaim`` limits the number of pages to evict.
|
||||||
|
|
||||||
|
A typical use case is that a job scheduler runs this command before it
|
||||||
|
tries to land a new job on a server. If it fails to materialize enough
|
||||||
|
cold pages because of the overestimation, it retries on the next
|
||||||
|
server according to the ranking result obtained from the working set
|
||||||
|
estimation step. This less forceful approach limits the impacts on the
|
||||||
|
existing jobs.
|
||||||
@@ -191,7 +191,14 @@ allocation failure to throttle the next allocation attempt::
|
|||||||
|
|
||||||
/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs
|
/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs
|
||||||
|
|
||||||
The khugepaged progress can be seen in the number of pages collapsed::
|
The khugepaged progress can be seen in the number of pages collapsed (note
|
||||||
|
that this counter may not be an exact count of the number of pages
|
||||||
|
collapsed, since "collapsed" could mean multiple things: (1) A PTE mapping
|
||||||
|
being replaced by a PMD mapping, or (2) All 4K physical pages replaced by
|
||||||
|
one 2M hugepage. Each may happen independently, or together, depending on
|
||||||
|
the type of memory and the failures that occur. As such, this value should
|
||||||
|
be interpreted roughly as a sign of progress, and counters in /proc/vmstat
|
||||||
|
consulted for more accurate accounting)::
|
||||||
|
|
||||||
/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
|
/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
|
||||||
|
|
||||||
@@ -366,10 +373,9 @@ thp_split_pmd
|
|||||||
page table entry.
|
page table entry.
|
||||||
|
|
||||||
thp_zero_page_alloc
|
thp_zero_page_alloc
|
||||||
is incremented every time a huge zero page is
|
is incremented every time a huge zero page used for thp is
|
||||||
successfully allocated. It includes allocations which where
|
successfully allocated. Note, it doesn't count every map of
|
||||||
dropped due race with other allocation. Note, it doesn't count
|
the huge zero page, only its allocation.
|
||||||
every map of the huge zero page, only its allocation.
|
|
||||||
|
|
||||||
thp_zero_page_alloc_failed
|
thp_zero_page_alloc_failed
|
||||||
is incremented if kernel fails to allocate
|
is incremented if kernel fails to allocate
|
||||||
|
|||||||
@@ -17,7 +17,10 @@ of the ``PROT_NONE+SIGSEGV`` trick.
|
|||||||
Design
|
Design
|
||||||
======
|
======
|
||||||
|
|
||||||
Userfaults are delivered and resolved through the ``userfaultfd`` syscall.
|
Userspace creates a new userfaultfd, initializes it, and registers one or more
|
||||||
|
regions of virtual memory with it. Then, any page faults which occur within the
|
||||||
|
region(s) result in a message being delivered to the userfaultfd, notifying
|
||||||
|
userspace of the fault.
|
||||||
|
|
||||||
The ``userfaultfd`` (aside from registering and unregistering virtual
|
The ``userfaultfd`` (aside from registering and unregistering virtual
|
||||||
memory ranges) provides two primary functionalities:
|
memory ranges) provides two primary functionalities:
|
||||||
@@ -34,12 +37,11 @@ The real advantage of userfaults if compared to regular virtual memory
|
|||||||
management of mremap/mprotect is that the userfaults in all their
|
management of mremap/mprotect is that the userfaults in all their
|
||||||
operations never involve heavyweight structures like vmas (in fact the
|
operations never involve heavyweight structures like vmas (in fact the
|
||||||
``userfaultfd`` runtime load never takes the mmap_lock for writing).
|
``userfaultfd`` runtime load never takes the mmap_lock for writing).
|
||||||
|
|
||||||
Vmas are not suitable for page- (or hugepage) granular fault tracking
|
Vmas are not suitable for page- (or hugepage) granular fault tracking
|
||||||
when dealing with virtual address spaces that could span
|
when dealing with virtual address spaces that could span
|
||||||
Terabytes. Too many vmas would be needed for that.
|
Terabytes. Too many vmas would be needed for that.
|
||||||
|
|
||||||
The ``userfaultfd`` once opened by invoking the syscall, can also be
|
The ``userfaultfd``, once created, can also be
|
||||||
passed using unix domain sockets to a manager process, so the same
|
passed using unix domain sockets to a manager process, so the same
|
||||||
manager process could handle the userfaults of a multitude of
|
manager process could handle the userfaults of a multitude of
|
||||||
different processes without them being aware about what is going on
|
different processes without them being aware about what is going on
|
||||||
@@ -50,6 +52,39 @@ is a corner case that would currently return ``-EBUSY``).
|
|||||||
API
|
API
|
||||||
===
|
===
|
||||||
|
|
||||||
|
Creating a userfaultfd
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
There are two ways to create a new userfaultfd, each of which provide ways to
|
||||||
|
restrict access to this functionality (since historically userfaultfds which
|
||||||
|
handle kernel page faults have been a useful tool for exploiting the kernel).
|
||||||
|
|
||||||
|
The first way, supported since userfaultfd was introduced, is the
|
||||||
|
userfaultfd(2) syscall. Access to this is controlled in several ways:
|
||||||
|
|
||||||
|
- Any user can always create a userfaultfd which traps userspace page faults
|
||||||
|
only. Such a userfaultfd can be created using the userfaultfd(2) syscall
|
||||||
|
with the flag UFFD_USER_MODE_ONLY.
|
||||||
|
|
||||||
|
- In order to also trap kernel page faults for the address space, either the
|
||||||
|
process needs the CAP_SYS_PTRACE capability, or the system must have
|
||||||
|
vm.unprivileged_userfaultfd set to 1. By default, vm.unprivileged_userfaultfd
|
||||||
|
is set to 0.
|
||||||
|
|
||||||
|
The second way, added to the kernel more recently, is by opening
|
||||||
|
/dev/userfaultfd and issuing a USERFAULTFD_IOC_NEW ioctl to it. This method
|
||||||
|
yields equivalent userfaultfds to the userfaultfd(2) syscall.
|
||||||
|
|
||||||
|
Unlike userfaultfd(2), access to /dev/userfaultfd is controlled via normal
|
||||||
|
filesystem permissions (user/group/mode), which gives fine grained access to
|
||||||
|
userfaultfd specifically, without also granting other unrelated privileges at
|
||||||
|
the same time (as e.g. granting CAP_SYS_PTRACE would do). Users who have access
|
||||||
|
to /dev/userfaultfd can always create userfaultfds that trap kernel page faults;
|
||||||
|
vm.unprivileged_userfaultfd is not considered.
|
||||||
|
|
||||||
|
Initializing a userfaultfd
|
||||||
|
--------------------------
|
||||||
|
|
||||||
When first opened the ``userfaultfd`` must be enabled invoking the
|
When first opened the ``userfaultfd`` must be enabled invoking the
|
||||||
``UFFDIO_API`` ioctl specifying a ``uffdio_api.api`` value set to ``UFFD_API`` (or
|
``UFFDIO_API`` ioctl specifying a ``uffdio_api.api`` value set to ``UFFD_API`` (or
|
||||||
a later API version) which will specify the ``read/POLLIN`` protocol
|
a later API version) which will specify the ``read/POLLIN`` protocol
|
||||||
|
|||||||
100
Documentation/admin-guide/perf/alibaba_pmu.rst
Normal file
100
Documentation/admin-guide/perf/alibaba_pmu.rst
Normal file
@@ -0,0 +1,100 @@
|
|||||||
|
=============================================================
|
||||||
|
Alibaba's T-Head SoC Uncore Performance Monitoring Unit (PMU)
|
||||||
|
=============================================================
|
||||||
|
|
||||||
|
The Yitian 710, custom-built by Alibaba Group's chip development business,
|
||||||
|
T-Head, implements uncore PMU for performance and functional debugging to
|
||||||
|
facilitate system maintenance.
|
||||||
|
|
||||||
|
DDR Sub-System Driveway (DRW) PMU Driver
|
||||||
|
=========================================
|
||||||
|
|
||||||
|
Yitian 710 employs eight DDR5/4 channels, four on each die. Each DDR5 channel
|
||||||
|
is independent of others to service system memory requests. And one DDR5
|
||||||
|
channel is split into two independent sub-channels. The DDR Sub-System Driveway
|
||||||
|
implements separate PMUs for each sub-channel to monitor various performance
|
||||||
|
metrics.
|
||||||
|
|
||||||
|
The Driveway PMU devices are named as ali_drw_<sys_base_addr> with perf.
|
||||||
|
For example, ali_drw_21000 and ali_drw_21080 are two PMU devices for two
|
||||||
|
sub-channels of the same channel in die 0. And the PMU device of die 1 is
|
||||||
|
prefixed with ali_drw_400XXXXX, e.g. ali_drw_40021000.
|
||||||
|
|
||||||
|
Each sub-channel has 36 PMU counters in total, which is classified into
|
||||||
|
four groups:
|
||||||
|
|
||||||
|
- Group 0: PMU Cycle Counter. This group has one pair of counters
|
||||||
|
pmu_cycle_cnt_low and pmu_cycle_cnt_high, that is used as the cycle count
|
||||||
|
based on DDRC core clock.
|
||||||
|
|
||||||
|
- Group 1: PMU Bandwidth Counters. This group has 8 counters that are used
|
||||||
|
to count the total access number of either the eight bank groups in a
|
||||||
|
selected rank, or four ranks separately in the first 4 counters. The base
|
||||||
|
transfer unit is 64B.
|
||||||
|
|
||||||
|
- Group 2: PMU Retry Counters. This group has 10 counters, that intend to
|
||||||
|
count the total retry number of each type of uncorrectable error.
|
||||||
|
|
||||||
|
- Group 3: PMU Common Counters. This group has 16 counters, that are used
|
||||||
|
to count the common events.
|
||||||
|
|
||||||
|
For now, the Driveway PMU driver only uses counters in group 0 and group 3.
|
||||||
|
|
||||||
|
The DDR Controller (DDRCTL) and DDR PHY combine to create a complete solution
|
||||||
|
for connecting an SoC application bus to DDR memory devices. The DDRCTL
|
||||||
|
receives transactions Host Interface (HIF) which is custom-defined by Synopsys.
|
||||||
|
These transactions are queued internally and scheduled for access while
|
||||||
|
satisfying the SDRAM protocol timing requirements, transaction priorities, and
|
||||||
|
dependencies between the transactions. The DDRCTL in turn issues commands on
|
||||||
|
the DDR PHY Interface (DFI) to the PHY module, which launches and captures data
|
||||||
|
to and from the SDRAM. The driveway PMUs have hardware logic to gather
|
||||||
|
statistics and performance logging signals on HIF, DFI, etc.
|
||||||
|
|
||||||
|
By counting the READ, WRITE and RMW commands sent to the DDRC through the HIF
|
||||||
|
interface, we could calculate the bandwidth. Example usage of counting memory
|
||||||
|
data bandwidth::
|
||||||
|
|
||||||
|
perf stat \
|
||||||
|
-e ali_drw_21000/hif_wr/ \
|
||||||
|
-e ali_drw_21000/hif_rd/ \
|
||||||
|
-e ali_drw_21000/hif_rmw/ \
|
||||||
|
-e ali_drw_21000/cycle/ \
|
||||||
|
-e ali_drw_21080/hif_wr/ \
|
||||||
|
-e ali_drw_21080/hif_rd/ \
|
||||||
|
-e ali_drw_21080/hif_rmw/ \
|
||||||
|
-e ali_drw_21080/cycle/ \
|
||||||
|
-e ali_drw_23000/hif_wr/ \
|
||||||
|
-e ali_drw_23000/hif_rd/ \
|
||||||
|
-e ali_drw_23000/hif_rmw/ \
|
||||||
|
-e ali_drw_23000/cycle/ \
|
||||||
|
-e ali_drw_23080/hif_wr/ \
|
||||||
|
-e ali_drw_23080/hif_rd/ \
|
||||||
|
-e ali_drw_23080/hif_rmw/ \
|
||||||
|
-e ali_drw_23080/cycle/ \
|
||||||
|
-e ali_drw_25000/hif_wr/ \
|
||||||
|
-e ali_drw_25000/hif_rd/ \
|
||||||
|
-e ali_drw_25000/hif_rmw/ \
|
||||||
|
-e ali_drw_25000/cycle/ \
|
||||||
|
-e ali_drw_25080/hif_wr/ \
|
||||||
|
-e ali_drw_25080/hif_rd/ \
|
||||||
|
-e ali_drw_25080/hif_rmw/ \
|
||||||
|
-e ali_drw_25080/cycle/ \
|
||||||
|
-e ali_drw_27000/hif_wr/ \
|
||||||
|
-e ali_drw_27000/hif_rd/ \
|
||||||
|
-e ali_drw_27000/hif_rmw/ \
|
||||||
|
-e ali_drw_27000/cycle/ \
|
||||||
|
-e ali_drw_27080/hif_wr/ \
|
||||||
|
-e ali_drw_27080/hif_rd/ \
|
||||||
|
-e ali_drw_27080/hif_rmw/ \
|
||||||
|
-e ali_drw_27080/cycle/ -- sleep 10
|
||||||
|
|
||||||
|
The average DRAM bandwidth can be calculated as follows:
|
||||||
|
|
||||||
|
- Read Bandwidth = perf_hif_rd * DDRC_WIDTH * DDRC_Freq / DDRC_Cycle
|
||||||
|
- Write Bandwidth = (perf_hif_wr + perf_hif_rmw) * DDRC_WIDTH * DDRC_Freq / DDRC_Cycle
|
||||||
|
|
||||||
|
Here, DDRC_WIDTH = 64 bytes.
|
||||||
|
|
||||||
|
The current driver does not support sampling. So "perf record" is
|
||||||
|
unsupported. Also attach to a task is unsupported as the events are all
|
||||||
|
uncore.
|
||||||
@@ -18,3 +18,4 @@ Performance monitor support
|
|||||||
xgene-pmu
|
xgene-pmu
|
||||||
arm_dsu_pmu
|
arm_dsu_pmu
|
||||||
thunderx2-pmu
|
thunderx2-pmu
|
||||||
|
alibaba_pmu
|
||||||
|
|||||||
@@ -182,6 +182,7 @@ to the ``struct sugov_cpu`` that the utilization update belongs to.
|
|||||||
Then, ``amd-pstate`` updates the desired performance according to the CPU
|
Then, ``amd-pstate`` updates the desired performance according to the CPU
|
||||||
scheduler assigned.
|
scheduler assigned.
|
||||||
|
|
||||||
|
.. _processor_support:
|
||||||
|
|
||||||
Processor Support
|
Processor Support
|
||||||
=======================
|
=======================
|
||||||
@@ -282,6 +283,8 @@ efficiency frequency management method on AMD processors.
|
|||||||
Kernel Module Options for ``amd-pstate``
|
Kernel Module Options for ``amd-pstate``
|
||||||
=========================================
|
=========================================
|
||||||
|
|
||||||
|
.. _shared_mem:
|
||||||
|
|
||||||
``shared_mem``
|
``shared_mem``
|
||||||
Use a module param (shared_mem) to enable related processors manually with
|
Use a module param (shared_mem) to enable related processors manually with
|
||||||
**amd_pstate.shared_mem=1**.
|
**amd_pstate.shared_mem=1**.
|
||||||
@@ -393,6 +396,76 @@ about part of the output. ::
|
|||||||
CPU_005 712 116384 39 49 166 0.7565 9645075 2214891 38431470 25.1 11.646 469 2.496 kworker/5:0-40
|
CPU_005 712 116384 39 49 166 0.7565 9645075 2214891 38431470 25.1 11.646 469 2.496 kworker/5:0-40
|
||||||
CPU_006 712 116408 39 49 166 0.6769 8950227 1839034 37192089 24.06 11.272 470 2.496 kworker/6:0-1264
|
CPU_006 712 116408 39 49 166 0.6769 8950227 1839034 37192089 24.06 11.272 470 2.496 kworker/6:0-1264
|
||||||
|
|
||||||
|
Unit Tests for amd-pstate
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
``amd-pstate-ut`` is a test module for testing the ``amd-pstate`` driver.
|
||||||
|
|
||||||
|
* It can help all users to verify their processor support (SBIOS/Firmware or Hardware).
|
||||||
|
|
||||||
|
* Kernel can have a basic function test to avoid the kernel regression during the update.
|
||||||
|
|
||||||
|
* We can introduce more functional or performance tests to align the result together, it will benefit power and performance scale optimization.
|
||||||
|
|
||||||
|
1. Test case decriptions
|
||||||
|
|
||||||
|
+---------+--------------------------------+------------------------------------------------------------------------------------+
|
||||||
|
| Index | Functions | Description |
|
||||||
|
+=========+================================+====================================================================================+
|
||||||
|
| 0 | amd_pstate_ut_acpi_cpc_valid || Check whether the _CPC object is present in SBIOS. |
|
||||||
|
| | || |
|
||||||
|
| | || The detail refer to `Processor Support <processor_support_>`_. |
|
||||||
|
+---------+--------------------------------+------------------------------------------------------------------------------------+
|
||||||
|
| 1 | amd_pstate_ut_check_enabled || Check whether AMD P-State is enabled. |
|
||||||
|
| | || |
|
||||||
|
| | || AMD P-States and ACPI hardware P-States always can be supported in one processor. |
|
||||||
|
| | | But AMD P-States has the higher priority and if it is enabled with |
|
||||||
|
| | | :c:macro:`MSR_AMD_CPPC_ENABLE` or ``cppc_set_enable``, it will respond to the |
|
||||||
|
| | | request from AMD P-States. |
|
||||||
|
+---------+--------------------------------+------------------------------------------------------------------------------------+
|
||||||
|
| 2 | amd_pstate_ut_check_perf || Check if the each performance values are reasonable. |
|
||||||
|
| | || highest_perf >= nominal_perf > lowest_nonlinear_perf > lowest_perf > 0. |
|
||||||
|
+---------+--------------------------------+------------------------------------------------------------------------------------+
|
||||||
|
| 3 | amd_pstate_ut_check_freq || Check if the each frequency values and max freq when set support boost mode |
|
||||||
|
| | | are reasonable. |
|
||||||
|
| | || max_freq >= nominal_freq > lowest_nonlinear_freq > min_freq > 0 |
|
||||||
|
| | || If boost is not active but supported, this maximum frequency will be larger than |
|
||||||
|
| | | the one in ``cpuinfo``. |
|
||||||
|
+---------+--------------------------------+------------------------------------------------------------------------------------+
|
||||||
|
|
||||||
|
#. How to execute the tests
|
||||||
|
|
||||||
|
We use test module in the kselftest frameworks to implement it.
|
||||||
|
We create amd-pstate-ut module and tie it into kselftest.(for
|
||||||
|
details refer to Linux Kernel Selftests [4]_).
|
||||||
|
|
||||||
|
1. Build
|
||||||
|
|
||||||
|
+ open the :c:macro:`CONFIG_X86_AMD_PSTATE` configuration option.
|
||||||
|
+ set the :c:macro:`CONFIG_X86_AMD_PSTATE_UT` configuration option to M.
|
||||||
|
+ make project
|
||||||
|
+ make selftest ::
|
||||||
|
|
||||||
|
$ cd linux
|
||||||
|
$ make -C tools/testing/selftests
|
||||||
|
|
||||||
|
#. Installation & Steps ::
|
||||||
|
|
||||||
|
$ make -C tools/testing/selftests install INSTALL_PATH=~/kselftest
|
||||||
|
$ sudo ./kselftest/run_kselftest.sh -c amd-pstate
|
||||||
|
TAP version 13
|
||||||
|
1..1
|
||||||
|
# selftests: amd-pstate: amd-pstate-ut.sh
|
||||||
|
# amd-pstate-ut: ok
|
||||||
|
ok 1 selftests: amd-pstate: amd-pstate-ut.sh
|
||||||
|
|
||||||
|
#. Results ::
|
||||||
|
|
||||||
|
$ dmesg | grep "amd_pstate_ut" | tee log.txt
|
||||||
|
[12977.570663] amd_pstate_ut: 1 amd_pstate_ut_acpi_cpc_valid success!
|
||||||
|
[12977.570673] amd_pstate_ut: 2 amd_pstate_ut_check_enabled success!
|
||||||
|
[12977.571207] amd_pstate_ut: 3 amd_pstate_ut_check_perf success!
|
||||||
|
[12977.571212] amd_pstate_ut: 4 amd_pstate_ut_check_freq success!
|
||||||
|
|
||||||
Reference
|
Reference
|
||||||
===========
|
===========
|
||||||
@@ -405,3 +478,6 @@ Reference
|
|||||||
|
|
||||||
.. [3] Processor Programming Reference (PPR) for AMD Family 19h Model 51h, Revision A1 Processors
|
.. [3] Processor Programming Reference (PPR) for AMD Family 19h Model 51h, Revision A1 Processors
|
||||||
https://www.amd.com/system/files/TechDocs/56569-A1-PUB.zip
|
https://www.amd.com/system/files/TechDocs/56569-A1-PUB.zip
|
||||||
|
|
||||||
|
.. [4] Linux Kernel Selftests,
|
||||||
|
https://www.kernel.org/doc/html/latest/dev-tools/kselftest.html
|
||||||
|
|||||||
@@ -65,6 +65,11 @@ combining the following values:
|
|||||||
4 s3_beep
|
4 s3_beep
|
||||||
= =======
|
= =======
|
||||||
|
|
||||||
|
arch
|
||||||
|
====
|
||||||
|
|
||||||
|
The machine hardware name, the same output as ``uname -m``
|
||||||
|
(e.g. ``x86_64`` or ``aarch64``).
|
||||||
|
|
||||||
auto_msgmni
|
auto_msgmni
|
||||||
===========
|
===========
|
||||||
@@ -635,6 +640,17 @@ different types of memory (represented as different NUMA nodes) to
|
|||||||
place the hot pages in the fast memory. This is implemented based on
|
place the hot pages in the fast memory. This is implemented based on
|
||||||
unmapping and page fault too.
|
unmapping and page fault too.
|
||||||
|
|
||||||
|
numa_balancing_promote_rate_limit_MBps
|
||||||
|
======================================
|
||||||
|
|
||||||
|
Too high promotion/demotion throughput between different memory types
|
||||||
|
may hurt application latency. This can be used to rate limit the
|
||||||
|
promotion throughput. The per-node max promotion throughput in MB/s
|
||||||
|
will be limited to be no more than the set value.
|
||||||
|
|
||||||
|
A rule of thumb is to set this to less than 1/10 of the PMEM node
|
||||||
|
write bandwidth.
|
||||||
|
|
||||||
oops_all_cpu_backtrace
|
oops_all_cpu_backtrace
|
||||||
======================
|
======================
|
||||||
|
|
||||||
|
|||||||
@@ -31,17 +31,18 @@ see only some of them, depending on your kernel's configuration.
|
|||||||
|
|
||||||
Table : Subdirectories in /proc/sys/net
|
Table : Subdirectories in /proc/sys/net
|
||||||
|
|
||||||
========= =================== = ========== ==================
|
========= =================== = ========== ===================
|
||||||
Directory Content Directory Content
|
Directory Content Directory Content
|
||||||
========= =================== = ========== ==================
|
========= =================== = ========== ===================
|
||||||
core General parameter appletalk Appletalk protocol
|
802 E802 protocol mptcp Multipath TCP
|
||||||
unix Unix domain sockets netrom NET/ROM
|
appletalk Appletalk protocol netfilter Network Filter
|
||||||
802 E802 protocol ax25 AX25
|
ax25 AX25 netrom NET/ROM
|
||||||
ethernet Ethernet protocol rose X.25 PLP layer
|
bridge Bridging rose X.25 PLP layer
|
||||||
|
core General parameter tipc TIPC
|
||||||
|
ethernet Ethernet protocol unix Unix domain sockets
|
||||||
ipv4 IP version 4 x25 X.25 protocol
|
ipv4 IP version 4 x25 X.25 protocol
|
||||||
bridge Bridging decnet DEC net
|
ipv6 IP version 6
|
||||||
ipv6 IP version 6 tipc TIPC
|
========= =================== = ========== ===================
|
||||||
========= =================== = ========== ==================
|
|
||||||
|
|
||||||
1. /proc/sys/net/core - Network core options
|
1. /proc/sys/net/core - Network core options
|
||||||
============================================
|
============================================
|
||||||
@@ -101,6 +102,9 @@ Values:
|
|||||||
- 1 - enable JIT hardening for unprivileged users only
|
- 1 - enable JIT hardening for unprivileged users only
|
||||||
- 2 - enable JIT hardening for all users
|
- 2 - enable JIT hardening for all users
|
||||||
|
|
||||||
|
where "privileged user" in this context means a process having
|
||||||
|
CAP_BPF or CAP_SYS_ADMIN in the root user name space.
|
||||||
|
|
||||||
bpf_jit_kallsyms
|
bpf_jit_kallsyms
|
||||||
----------------
|
----------------
|
||||||
|
|
||||||
|
|||||||
@@ -926,6 +926,9 @@ calls without any restrictions.
|
|||||||
|
|
||||||
The default value is 0.
|
The default value is 0.
|
||||||
|
|
||||||
|
Another way to control permissions for userfaultfd is to use
|
||||||
|
/dev/userfaultfd instead of userfaultfd(2). See
|
||||||
|
Documentation/admin-guide/mm/userfaultfd.rst.
|
||||||
|
|
||||||
user_reserve_kbytes
|
user_reserve_kbytes
|
||||||
===================
|
===================
|
||||||
|
|||||||
@@ -134,6 +134,12 @@ More detailed explanation for tainting
|
|||||||
scsi/snic on something else than x86_64, scsi/ips on non
|
scsi/snic on something else than x86_64, scsi/ips on non
|
||||||
x86/x86_64/itanium, have broken firmware settings for the
|
x86/x86_64/itanium, have broken firmware settings for the
|
||||||
irqchip/irq-gic on arm64 ...).
|
irqchip/irq-gic on arm64 ...).
|
||||||
|
- x86/x86_64: Microcode late loading is dangerous and will result in
|
||||||
|
tainting the kernel. It requires that all CPUs rendezvous to make sure
|
||||||
|
the update happens when the system is as quiescent as possible. However,
|
||||||
|
a higher priority MCE/SMI/NMI can move control flow away from that
|
||||||
|
rendezvous and interrupt the update, which can be detrimental to the
|
||||||
|
machine.
|
||||||
|
|
||||||
3) ``R`` if a module was force unloaded by ``rmmod -f``, ``' '`` if all
|
3) ``R`` if a module was force unloaded by ``rmmod -f``, ``' '`` if all
|
||||||
modules were unloaded normally.
|
modules were unloaded normally.
|
||||||
|
|||||||
@@ -59,6 +59,7 @@ SoC-specific documents
|
|||||||
stm32/stm32f429-overview
|
stm32/stm32f429-overview
|
||||||
stm32/stm32mp13-overview
|
stm32/stm32mp13-overview
|
||||||
stm32/stm32mp157-overview
|
stm32/stm32mp157-overview
|
||||||
|
stm32/stm32-dma-mdma-chaining
|
||||||
|
|
||||||
sunxi
|
sunxi
|
||||||
|
|
||||||
|
|||||||
415
Documentation/arm/stm32/stm32-dma-mdma-chaining.rst
Normal file
415
Documentation/arm/stm32/stm32-dma-mdma-chaining.rst
Normal file
@@ -0,0 +1,415 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
=======================
|
||||||
|
STM32 DMA-MDMA chaining
|
||||||
|
=======================
|
||||||
|
|
||||||
|
|
||||||
|
Introduction
|
||||||
|
------------
|
||||||
|
|
||||||
|
This document describes the STM32 DMA-MDMA chaining feature. But before going
|
||||||
|
further, let's introduce the peripherals involved.
|
||||||
|
|
||||||
|
To offload data transfers from the CPU, STM32 microprocessors (MPUs) embed
|
||||||
|
direct memory access controllers (DMA).
|
||||||
|
|
||||||
|
STM32MP1 SoCs embed both STM32 DMA and STM32 MDMA controllers. STM32 DMA
|
||||||
|
request routing capabilities are enhanced by a DMA request multiplexer
|
||||||
|
(STM32 DMAMUX).
|
||||||
|
|
||||||
|
**STM32 DMAMUX**
|
||||||
|
|
||||||
|
STM32 DMAMUX routes any DMA request from a given peripheral to any STM32 DMA
|
||||||
|
controller (STM32MP1 counts two STM32 DMA controllers) channels.
|
||||||
|
|
||||||
|
**STM32 DMA**
|
||||||
|
|
||||||
|
STM32 DMA is mainly used to implement central data buffer storage (usually in
|
||||||
|
the system SRAM) for different peripheral. It can access external RAMs but
|
||||||
|
without the ability to generate convenient burst transfer ensuring the best
|
||||||
|
load of the AXI.
|
||||||
|
|
||||||
|
**STM32 MDMA**
|
||||||
|
|
||||||
|
STM32 MDMA (Master DMA) is mainly used to manage direct data transfers between
|
||||||
|
RAM data buffers without CPU intervention. It can also be used in a
|
||||||
|
hierarchical structure that uses STM32 DMA as first level data buffer
|
||||||
|
interfaces for AHB peripherals, while the STM32 MDMA acts as a second level
|
||||||
|
DMA with better performance. As a AXI/AHB master, STM32 MDMA can take control
|
||||||
|
of the AXI/AHB bus.
|
||||||
|
|
||||||
|
|
||||||
|
Principles
|
||||||
|
----------
|
||||||
|
|
||||||
|
STM32 DMA-MDMA chaining feature relies on the strengths of STM32 DMA and
|
||||||
|
STM32 MDMA controllers.
|
||||||
|
|
||||||
|
STM32 DMA has a circular Double Buffer Mode (DBM). At each end of transaction
|
||||||
|
(when DMA data counter - DMA_SxNDTR - reaches 0), the memory pointers
|
||||||
|
(configured with DMA_SxSM0AR and DMA_SxM1AR) are swapped and the DMA data
|
||||||
|
counter is automatically reloaded. This allows the SW or the STM32 MDMA to
|
||||||
|
process one memory area while the second memory area is being filled/used by
|
||||||
|
the STM32 DMA transfer.
|
||||||
|
|
||||||
|
With STM32 MDMA linked-list mode, a single request initiates the data array
|
||||||
|
(collection of nodes) to be transferred until the linked-list pointer for the
|
||||||
|
channel is null. The channel transfer complete of the last node is the end of
|
||||||
|
transfer, unless first and last nodes are linked to each other, in such a
|
||||||
|
case, the linked-list loops on to create a circular MDMA transfer.
|
||||||
|
|
||||||
|
STM32 MDMA has direct connections with STM32 DMA. This enables autonomous
|
||||||
|
communication and synchronization between peripherals, thus saving CPU
|
||||||
|
resources and bus congestion. Transfer Complete signal of STM32 DMA channel
|
||||||
|
can triggers STM32 MDMA transfer. STM32 MDMA can clear the request generated
|
||||||
|
by the STM32 DMA by writing to its Interrupt Clear register (whose address is
|
||||||
|
stored in MDMA_CxMAR, and bit mask in MDMA_CxMDR).
|
||||||
|
|
||||||
|
.. table:: STM32 MDMA interconnect table with STM32 DMA
|
||||||
|
|
||||||
|
+--------------+----------------+-----------+------------+
|
||||||
|
| STM32 DMAMUX | STM32 DMA | STM32 DMA | STM32 MDMA |
|
||||||
|
| channels | channels | Transfer | request |
|
||||||
|
| | | complete | |
|
||||||
|
| | | signal | |
|
||||||
|
+==============+================+===========+============+
|
||||||
|
| Channel *0* | DMA1 channel 0 | dma1_tcf0 | *0x00* |
|
||||||
|
+--------------+----------------+-----------+------------+
|
||||||
|
| Channel *1* | DMA1 channel 1 | dma1_tcf1 | *0x01* |
|
||||||
|
+--------------+----------------+-----------+------------+
|
||||||
|
| Channel *2* | DMA1 channel 2 | dma1_tcf2 | *0x02* |
|
||||||
|
+--------------+----------------+-----------+------------+
|
||||||
|
| Channel *3* | DMA1 channel 3 | dma1_tcf3 | *0x03* |
|
||||||
|
+--------------+----------------+-----------+------------+
|
||||||
|
| Channel *4* | DMA1 channel 4 | dma1_tcf4 | *0x04* |
|
||||||
|
+--------------+----------------+-----------+------------+
|
||||||
|
| Channel *5* | DMA1 channel 5 | dma1_tcf5 | *0x05* |
|
||||||
|
+--------------+----------------+-----------+------------+
|
||||||
|
| Channel *6* | DMA1 channel 6 | dma1_tcf6 | *0x06* |
|
||||||
|
+--------------+----------------+-----------+------------+
|
||||||
|
| Channel *7* | DMA1 channel 7 | dma1_tcf7 | *0x07* |
|
||||||
|
+--------------+----------------+-----------+------------+
|
||||||
|
| Channel *8* | DMA2 channel 0 | dma2_tcf0 | *0x08* |
|
||||||
|
+--------------+----------------+-----------+------------+
|
||||||
|
| Channel *9* | DMA2 channel 1 | dma2_tcf1 | *0x09* |
|
||||||
|
+--------------+----------------+-----------+------------+
|
||||||
|
| Channel *10* | DMA2 channel 2 | dma2_tcf2 | *0x0A* |
|
||||||
|
+--------------+----------------+-----------+------------+
|
||||||
|
| Channel *11* | DMA2 channel 3 | dma2_tcf3 | *0x0B* |
|
||||||
|
+--------------+----------------+-----------+------------+
|
||||||
|
| Channel *12* | DMA2 channel 4 | dma2_tcf4 | *0x0C* |
|
||||||
|
+--------------+----------------+-----------+------------+
|
||||||
|
| Channel *13* | DMA2 channel 5 | dma2_tcf5 | *0x0D* |
|
||||||
|
+--------------+----------------+-----------+------------+
|
||||||
|
| Channel *14* | DMA2 channel 6 | dma2_tcf6 | *0x0E* |
|
||||||
|
+--------------+----------------+-----------+------------+
|
||||||
|
| Channel *15* | DMA2 channel 7 | dma2_tcf7 | *0x0F* |
|
||||||
|
+--------------+----------------+-----------+------------+
|
||||||
|
|
||||||
|
STM32 DMA-MDMA chaining feature then uses a SRAM buffer. STM32MP1 SoCs embed
|
||||||
|
three fast access static internal RAMs of various size, used for data storage.
|
||||||
|
Due to STM32 DMA legacy (within microcontrollers), STM32 DMA performances are
|
||||||
|
bad with DDR, while they are optimal with SRAM. Hence the SRAM buffer used
|
||||||
|
between STM32 DMA and STM32 MDMA. This buffer is split in two equal periods
|
||||||
|
and STM32 DMA uses one period while STM32 MDMA uses the other period
|
||||||
|
simultaneously.
|
||||||
|
::
|
||||||
|
|
||||||
|
dma[1:2]-tcf[0:7]
|
||||||
|
.----------------.
|
||||||
|
____________ ' _________ V____________
|
||||||
|
| STM32 DMA | / __|>_ \ | STM32 MDMA |
|
||||||
|
|------------| | / \ | |------------|
|
||||||
|
| DMA_SxM0AR |<=>| | SRAM | |<=>| []-[]...[] |
|
||||||
|
| DMA_SxM1AR | | \_____/ | | |
|
||||||
|
|____________| \___<|____/ |____________|
|
||||||
|
|
||||||
|
STM32 DMA-MDMA chaining uses (struct dma_slave_config).peripheral_config to
|
||||||
|
exchange the parameters needed to configure MDMA. These parameters are
|
||||||
|
gathered into a u32 array with three values:
|
||||||
|
|
||||||
|
* the STM32 MDMA request (which is actually the DMAMUX channel ID),
|
||||||
|
* the address of the STM32 DMA register to clear the Transfer Complete
|
||||||
|
interrupt flag,
|
||||||
|
* the mask of the Transfer Complete interrupt flag of the STM32 DMA channel.
|
||||||
|
|
||||||
|
Device Tree updates for STM32 DMA-MDMA chaining support
|
||||||
|
-------------------------------------------------------
|
||||||
|
|
||||||
|
**1. Allocate a SRAM buffer**
|
||||||
|
|
||||||
|
SRAM device tree node is defined in SoC device tree. You can refer to it in
|
||||||
|
your board device tree to define your SRAM pool.
|
||||||
|
::
|
||||||
|
|
||||||
|
&sram {
|
||||||
|
my_foo_device_dma_pool: dma-sram@0 {
|
||||||
|
reg = <0x0 0x1000>;
|
||||||
|
};
|
||||||
|
};
|
||||||
|
|
||||||
|
Be careful of the start index, in case there are other SRAM consumers.
|
||||||
|
Define your pool size strategically: to optimise chaining, the idea is that
|
||||||
|
STM32 DMA and STM32 MDMA can work simultaneously, on each buffer of the
|
||||||
|
SRAM.
|
||||||
|
If the SRAM period is greater than the expected DMA transfer, then STM32 DMA
|
||||||
|
and STM32 MDMA will work sequentially instead of simultaneously. It is not a
|
||||||
|
functional issue but it is not optimal.
|
||||||
|
|
||||||
|
Don't forget to refer to your SRAM pool in your device node. You need to
|
||||||
|
define a new property.
|
||||||
|
::
|
||||||
|
|
||||||
|
&my_foo_device {
|
||||||
|
...
|
||||||
|
my_dma_pool = &my_foo_device_dma_pool;
|
||||||
|
};
|
||||||
|
|
||||||
|
Then get this SRAM pool in your foo driver and allocate your SRAM buffer.
|
||||||
|
|
||||||
|
**2. Allocate a STM32 DMA channel and a STM32 MDMA channel**
|
||||||
|
|
||||||
|
You need to define an extra channel in your device tree node, in addition to
|
||||||
|
the one you should already have for "classic" DMA operation.
|
||||||
|
|
||||||
|
This new channel must be taken from STM32 MDMA channels, so, the phandle of
|
||||||
|
the DMA controller to use is the MDMA controller's one.
|
||||||
|
::
|
||||||
|
|
||||||
|
&my_foo_device {
|
||||||
|
[...]
|
||||||
|
my_dma_pool = &my_foo_device_dma_pool;
|
||||||
|
dmas = <&dmamux1 ...>, // STM32 DMA channel
|
||||||
|
<&mdma1 0 0x3 0x1200000a 0 0>; // + STM32 MDMA channel
|
||||||
|
};
|
||||||
|
|
||||||
|
Concerning STM32 MDMA bindings:
|
||||||
|
|
||||||
|
1. The request line number : whatever the value here, it will be overwritten
|
||||||
|
by MDMA driver with the STM32 DMAMUX channel ID passed through
|
||||||
|
(struct dma_slave_config).peripheral_config
|
||||||
|
|
||||||
|
2. The priority level : choose Very High (0x3) so that your channel will
|
||||||
|
take priority other the other during request arbitration
|
||||||
|
|
||||||
|
3. A 32bit mask specifying the DMA channel configuration : source and
|
||||||
|
destination address increment, block transfer with 128 bytes per single
|
||||||
|
transfer
|
||||||
|
|
||||||
|
4. The 32bit value specifying the register to be used to acknowledge the
|
||||||
|
request: it will be overwritten by MDMA driver, with the DMA channel
|
||||||
|
interrupt flag clear register address passed through
|
||||||
|
(struct dma_slave_config).peripheral_config
|
||||||
|
|
||||||
|
5. The 32bit mask specifying the value to be written to acknowledge the
|
||||||
|
request: it will be overwritten by MDMA driver, with the DMA channel
|
||||||
|
Transfer Complete flag passed through
|
||||||
|
(struct dma_slave_config).peripheral_config
|
||||||
|
|
||||||
|
Driver updates for STM32 DMA-MDMA chaining support in foo driver
|
||||||
|
----------------------------------------------------------------
|
||||||
|
|
||||||
|
**0. (optional) Refactor the original sg_table if dmaengine_prep_slave_sg()**
|
||||||
|
|
||||||
|
In case of dmaengine_prep_slave_sg(), the original sg_table can't be used as
|
||||||
|
is. Two new sg_tables must be created from the original one. One for
|
||||||
|
STM32 DMA transfer (where memory address targets now the SRAM buffer instead
|
||||||
|
of DDR buffer) and one for STM32 MDMA transfer (where memory address targets
|
||||||
|
the DDR buffer).
|
||||||
|
|
||||||
|
The new sg_list items must fit SRAM period length. Here is an example for
|
||||||
|
DMA_DEV_TO_MEM:
|
||||||
|
::
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Assuming sgl and nents, respectively the initial scatterlist and its
|
||||||
|
* length.
|
||||||
|
* Assuming sram_dma_buf and sram_period, respectively the memory
|
||||||
|
* allocated from the pool for DMA usage, and the length of the period,
|
||||||
|
* which is half of the sram_buf size.
|
||||||
|
*/
|
||||||
|
struct sg_table new_dma_sgt, new_mdma_sgt;
|
||||||
|
struct scatterlist *s, *_sgl;
|
||||||
|
dma_addr_t ddr_dma_buf;
|
||||||
|
u32 new_nents = 0, len;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
/* Count the number of entries needed */
|
||||||
|
for_each_sg(sgl, s, nents, i)
|
||||||
|
if (sg_dma_len(s) > sram_period)
|
||||||
|
new_nents += DIV_ROUND_UP(sg_dma_len(s), sram_period);
|
||||||
|
else
|
||||||
|
new_nents++;
|
||||||
|
|
||||||
|
/* Create sg table for STM32 DMA channel */
|
||||||
|
ret = sg_alloc_table(&new_dma_sgt, new_nents, GFP_ATOMIC);
|
||||||
|
if (ret)
|
||||||
|
dev_err(dev, "DMA sg table alloc failed\n");
|
||||||
|
|
||||||
|
for_each_sg(new_dma_sgt.sgl, s, new_dma_sgt.nents, i) {
|
||||||
|
_sgl = sgl;
|
||||||
|
sg_dma_len(s) = min(sg_dma_len(_sgl), sram_period);
|
||||||
|
/* Targets the beginning = first half of the sram_buf */
|
||||||
|
s->dma_address = sram_buf;
|
||||||
|
/*
|
||||||
|
* Targets the second half of the sram_buf
|
||||||
|
* for odd indexes of the item of the sg_list
|
||||||
|
*/
|
||||||
|
if (i & 1)
|
||||||
|
s->dma_address += sram_period;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Create sg table for STM32 MDMA channel */
|
||||||
|
ret = sg_alloc_table(&new_mdma_sgt, new_nents, GFP_ATOMIC);
|
||||||
|
if (ret)
|
||||||
|
dev_err(dev, "MDMA sg_table alloc failed\n");
|
||||||
|
|
||||||
|
_sgl = sgl;
|
||||||
|
len = sg_dma_len(sgl);
|
||||||
|
ddr_dma_buf = sg_dma_address(sgl);
|
||||||
|
for_each_sg(mdma_sgt.sgl, s, mdma_sgt.nents, i) {
|
||||||
|
size_t bytes = min_t(size_t, len, sram_period);
|
||||||
|
|
||||||
|
sg_dma_len(s) = bytes;
|
||||||
|
sg_dma_address(s) = ddr_dma_buf;
|
||||||
|
len -= bytes;
|
||||||
|
|
||||||
|
if (!len && sg_next(_sgl)) {
|
||||||
|
_sgl = sg_next(_sgl);
|
||||||
|
len = sg_dma_len(_sgl);
|
||||||
|
ddr_dma_buf = sg_dma_address(_sgl);
|
||||||
|
} else {
|
||||||
|
ddr_dma_buf += bytes;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Don't forget to release these new sg_tables after getting the descriptors
|
||||||
|
with dmaengine_prep_slave_sg().
|
||||||
|
|
||||||
|
**1. Set controller specific parameters**
|
||||||
|
|
||||||
|
First, use dmaengine_slave_config() with a struct dma_slave_config to
|
||||||
|
configure STM32 DMA channel. You just have to take care of DMA addresses,
|
||||||
|
the memory address (depending on the transfer direction) must point on your
|
||||||
|
SRAM buffer, and set (struct dma_slave_config).peripheral_size != 0.
|
||||||
|
|
||||||
|
STM32 DMA driver will check (struct dma_slave_config).peripheral_size to
|
||||||
|
determine if chaining is being used or not. If it is used, then STM32 DMA
|
||||||
|
driver fills (struct dma_slave_config).peripheral_config with an array of
|
||||||
|
three u32 : the first one containing STM32 DMAMUX channel ID, the second one
|
||||||
|
the channel interrupt flag clear register address, and the third one the
|
||||||
|
channel Transfer Complete flag mask.
|
||||||
|
|
||||||
|
Then, use dmaengine_slave_config with another struct dma_slave_config to
|
||||||
|
configure STM32 MDMA channel. Take care of DMA addresses, the device address
|
||||||
|
(depending on the transfer direction) must point on your SRAM buffer, and
|
||||||
|
the memory address must point to the buffer originally used for "classic"
|
||||||
|
DMA operation. Use the previous (struct dma_slave_config).peripheral_size
|
||||||
|
and .peripheral_config that have been updated by STM32 DMA driver, to set
|
||||||
|
(struct dma_slave_config).peripheral_size and .peripheral_config of the
|
||||||
|
struct dma_slave_config to configure STM32 MDMA channel.
|
||||||
|
::
|
||||||
|
|
||||||
|
struct dma_slave_config dma_conf;
|
||||||
|
struct dma_slave_config mdma_conf;
|
||||||
|
|
||||||
|
memset(&dma_conf, 0, sizeof(dma_conf));
|
||||||
|
[...]
|
||||||
|
config.direction = DMA_DEV_TO_MEM;
|
||||||
|
config.dst_addr = sram_dma_buf; // SRAM buffer
|
||||||
|
config.peripheral_size = 1; // peripheral_size != 0 => chaining
|
||||||
|
|
||||||
|
dmaengine_slave_config(dma_chan, &dma_config);
|
||||||
|
|
||||||
|
memset(&mdma_conf, 0, sizeof(mdma_conf));
|
||||||
|
config.direction = DMA_DEV_TO_MEM;
|
||||||
|
mdma_conf.src_addr = sram_dma_buf; // SRAM buffer
|
||||||
|
mdma_conf.dst_addr = rx_dma_buf; // original memory buffer
|
||||||
|
mdma_conf.peripheral_size = dma_conf.peripheral_size; // <- dma_conf
|
||||||
|
mdma_conf.peripheral_config = dma_config.peripheral_config; // <- dma_conf
|
||||||
|
|
||||||
|
dmaengine_slave_config(mdma_chan, &mdma_conf);
|
||||||
|
|
||||||
|
**2. Get a descriptor for STM32 DMA channel transaction**
|
||||||
|
|
||||||
|
In the same way you get your descriptor for your "classic" DMA operation,
|
||||||
|
you just have to replace the original sg_list (in case of
|
||||||
|
dmaengine_prep_slave_sg()) with the new sg_list using SRAM buffer, or to
|
||||||
|
replace the original buffer address, length and period (in case of
|
||||||
|
dmaengine_prep_dma_cyclic()) with the new SRAM buffer.
|
||||||
|
|
||||||
|
**3. Get a descriptor for STM32 MDMA channel transaction**
|
||||||
|
|
||||||
|
If you previously get descriptor (for STM32 DMA) with
|
||||||
|
|
||||||
|
* dmaengine_prep_slave_sg(), then use dmaengine_prep_slave_sg() for
|
||||||
|
STM32 MDMA;
|
||||||
|
* dmaengine_prep_dma_cyclic(), then use dmaengine_prep_dma_cyclic() for
|
||||||
|
STM32 MDMA.
|
||||||
|
|
||||||
|
Use the new sg_list using SRAM buffer (in case of dmaengine_prep_slave_sg())
|
||||||
|
or, depending on the transfer direction, either the original DDR buffer (in
|
||||||
|
case of DMA_DEV_TO_MEM) or the SRAM buffer (in case of DMA_MEM_TO_DEV), the
|
||||||
|
source address being previously set with dmaengine_slave_config().
|
||||||
|
|
||||||
|
**4. Submit both transactions**
|
||||||
|
|
||||||
|
Before submitting your transactions, you may need to define on which
|
||||||
|
descriptor you want a callback to be called at the end of the transfer
|
||||||
|
(dmaengine_prep_slave_sg()) or the period (dmaengine_prep_dma_cyclic()).
|
||||||
|
Depending on the direction, set the callback on the descriptor that finishes
|
||||||
|
the overal transfer:
|
||||||
|
|
||||||
|
* DMA_DEV_TO_MEM: set the callback on the "MDMA" descriptor
|
||||||
|
* DMA_MEM_TO_DEV: set the callback on the "DMA" descriptor
|
||||||
|
|
||||||
|
Then, submit the descriptors whatever the order, with dmaengine_tx_submit().
|
||||||
|
|
||||||
|
**5. Issue pending requests (and wait for callback notification)**
|
||||||
|
|
||||||
|
As STM32 MDMA channel transfer is triggered by STM32 DMA, you must issue
|
||||||
|
STM32 MDMA channel before STM32 DMA channel.
|
||||||
|
|
||||||
|
If any, your callback will be called to warn you about the end of the overal
|
||||||
|
transfer or the period completion.
|
||||||
|
|
||||||
|
Don't forget to terminate both channels. STM32 DMA channel is configured in
|
||||||
|
cyclic Double-Buffer mode so it won't be disabled by HW, you need to terminate
|
||||||
|
it. STM32 MDMA channel will be stopped by HW in case of sg transfer, but not
|
||||||
|
in case of cyclic transfer. You can terminate it whatever the kind of transfer.
|
||||||
|
|
||||||
|
**STM32 DMA-MDMA chaining DMA_MEM_TO_DEV special case**
|
||||||
|
|
||||||
|
STM32 DMA-MDMA chaining in DMA_MEM_TO_DEV is a special case. Indeed, the
|
||||||
|
STM32 MDMA feeds the SRAM buffer with the DDR data, and the STM32 DMA reads
|
||||||
|
data from SRAM buffer. So some data (the first period) have to be copied in
|
||||||
|
SRAM buffer when the STM32 DMA starts to read.
|
||||||
|
|
||||||
|
A trick could be pausing the STM32 DMA channel (that will raise a Transfer
|
||||||
|
Complete signal, triggering the STM32 MDMA channel), but the first data read
|
||||||
|
by the STM32 DMA could be "wrong". The proper way is to prepare the first SRAM
|
||||||
|
period with dmaengine_prep_dma_memcpy(). Then this first period should be
|
||||||
|
"removed" from the sg or the cyclic transfer.
|
||||||
|
|
||||||
|
Due to this complexity, rather use the STM32 DMA-MDMA chaining for
|
||||||
|
DMA_DEV_TO_MEM and keep the "classic" DMA usage for DMA_MEM_TO_DEV, unless
|
||||||
|
you're not afraid.
|
||||||
|
|
||||||
|
Resources
|
||||||
|
---------
|
||||||
|
|
||||||
|
Application note, datasheet and reference manual are available on ST website
|
||||||
|
(STM32MP1_).
|
||||||
|
|
||||||
|
Dedicated focus on three application notes (AN5224_, AN4031_ & AN5001_)
|
||||||
|
dealing with STM32 DMAMUX, STM32 DMA and STM32 MDMA.
|
||||||
|
|
||||||
|
.. _STM32MP1: https://www.st.com/en/microcontrollers-microprocessors/stm32mp1-series.html
|
||||||
|
.. _AN5224: https://www.st.com/resource/en/application_note/an5224-stm32-dmamux-the-dma-request-router-stmicroelectronics.pdf
|
||||||
|
.. _AN4031: https://www.st.com/resource/en/application_note/dm00046011-using-the-stm32f2-stm32f4-and-stm32f7-series-dma-controller-stmicroelectronics.pdf
|
||||||
|
.. _AN5001: https://www.st.com/resource/en/application_note/an5001-stm32cube-expansion-package-for-stm32h7-series-mdma-stmicroelectronics.pdf
|
||||||
|
|
||||||
|
:Authors:
|
||||||
|
|
||||||
|
- Amelie Delaunay <amelie.delaunay@foss.st.com>
|
||||||
@@ -65,10 +65,6 @@ linux,uefi-mmap-desc-size 32-bit Size in bytes of each entry in the UEFI
|
|||||||
|
|
||||||
linux,uefi-mmap-desc-ver 32-bit Version of the mmap descriptor format.
|
linux,uefi-mmap-desc-ver 32-bit Version of the mmap descriptor format.
|
||||||
|
|
||||||
linux,initrd-start 64-bit Physical start address of an initrd
|
|
||||||
|
|
||||||
linux,initrd-end 64-bit Physical end address of an initrd
|
|
||||||
|
|
||||||
kaslr-seed 64-bit Entropy used to randomize the kernel image
|
kaslr-seed 64-bit Entropy used to randomize the kernel image
|
||||||
base address location.
|
base address location.
|
||||||
========================== ====== ===========================================
|
========================== ====== ===========================================
|
||||||
|
|||||||
@@ -242,46 +242,39 @@ HWCAP2_MTE3
|
|||||||
by Documentation/arm64/memory-tagging-extension.rst.
|
by Documentation/arm64/memory-tagging-extension.rst.
|
||||||
|
|
||||||
HWCAP2_SME
|
HWCAP2_SME
|
||||||
|
|
||||||
Functionality implied by ID_AA64PFR1_EL1.SME == 0b0001, as described
|
Functionality implied by ID_AA64PFR1_EL1.SME == 0b0001, as described
|
||||||
by Documentation/arm64/sme.rst.
|
by Documentation/arm64/sme.rst.
|
||||||
|
|
||||||
HWCAP2_SME_I16I64
|
HWCAP2_SME_I16I64
|
||||||
|
|
||||||
Functionality implied by ID_AA64SMFR0_EL1.I16I64 == 0b1111.
|
Functionality implied by ID_AA64SMFR0_EL1.I16I64 == 0b1111.
|
||||||
|
|
||||||
HWCAP2_SME_F64F64
|
HWCAP2_SME_F64F64
|
||||||
|
|
||||||
Functionality implied by ID_AA64SMFR0_EL1.F64F64 == 0b1.
|
Functionality implied by ID_AA64SMFR0_EL1.F64F64 == 0b1.
|
||||||
|
|
||||||
HWCAP2_SME_I8I32
|
HWCAP2_SME_I8I32
|
||||||
|
|
||||||
Functionality implied by ID_AA64SMFR0_EL1.I8I32 == 0b1111.
|
Functionality implied by ID_AA64SMFR0_EL1.I8I32 == 0b1111.
|
||||||
|
|
||||||
HWCAP2_SME_F16F32
|
HWCAP2_SME_F16F32
|
||||||
|
|
||||||
Functionality implied by ID_AA64SMFR0_EL1.F16F32 == 0b1.
|
Functionality implied by ID_AA64SMFR0_EL1.F16F32 == 0b1.
|
||||||
|
|
||||||
HWCAP2_SME_B16F32
|
HWCAP2_SME_B16F32
|
||||||
|
|
||||||
Functionality implied by ID_AA64SMFR0_EL1.B16F32 == 0b1.
|
Functionality implied by ID_AA64SMFR0_EL1.B16F32 == 0b1.
|
||||||
|
|
||||||
HWCAP2_SME_F32F32
|
HWCAP2_SME_F32F32
|
||||||
|
|
||||||
Functionality implied by ID_AA64SMFR0_EL1.F32F32 == 0b1.
|
Functionality implied by ID_AA64SMFR0_EL1.F32F32 == 0b1.
|
||||||
|
|
||||||
HWCAP2_SME_FA64
|
HWCAP2_SME_FA64
|
||||||
|
|
||||||
Functionality implied by ID_AA64SMFR0_EL1.FA64 == 0b1.
|
Functionality implied by ID_AA64SMFR0_EL1.FA64 == 0b1.
|
||||||
|
|
||||||
HWCAP2_WFXT
|
HWCAP2_WFXT
|
||||||
|
|
||||||
Functionality implied by ID_AA64ISAR2_EL1.WFXT == 0b0010.
|
Functionality implied by ID_AA64ISAR2_EL1.WFXT == 0b0010.
|
||||||
|
|
||||||
HWCAP2_EBF16
|
HWCAP2_EBF16
|
||||||
|
|
||||||
Functionality implied by ID_AA64ISAR1_EL1.BF16 == 0b0010.
|
Functionality implied by ID_AA64ISAR1_EL1.BF16 == 0b0010.
|
||||||
|
|
||||||
|
HWCAP2_SVE_EBF16
|
||||||
|
Functionality implied by ID_AA64ZFR0_EL1.BF16 == 0b0010.
|
||||||
|
|
||||||
4. Unused AT_HWCAP bits
|
4. Unused AT_HWCAP bits
|
||||||
-----------------------
|
-----------------------
|
||||||
|
|
||||||
|
|||||||
@@ -52,6 +52,8 @@ stable kernels.
|
|||||||
| Allwinner | A64/R18 | UNKNOWN1 | SUN50I_ERRATUM_UNKNOWN1 |
|
| Allwinner | A64/R18 | UNKNOWN1 | SUN50I_ERRATUM_UNKNOWN1 |
|
||||||
+----------------+-----------------+-----------------+-----------------------------+
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
+----------------+-----------------+-----------------+-----------------------------+
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
|
| ARM | Cortex-A510 | #2457168 | ARM64_ERRATUM_2457168 |
|
||||||
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
| ARM | Cortex-A510 | #2064142 | ARM64_ERRATUM_2064142 |
|
| ARM | Cortex-A510 | #2064142 | ARM64_ERRATUM_2064142 |
|
||||||
+----------------+-----------------+-----------------+-----------------------------+
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
| ARM | Cortex-A510 | #2038923 | ARM64_ERRATUM_2038923 |
|
| ARM | Cortex-A510 | #2038923 | ARM64_ERRATUM_2038923 |
|
||||||
@@ -74,6 +76,8 @@ stable kernels.
|
|||||||
+----------------+-----------------+-----------------+-----------------------------+
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
| ARM | Cortex-A55 | #1530923 | ARM64_ERRATUM_1530923 |
|
| ARM | Cortex-A55 | #1530923 | ARM64_ERRATUM_1530923 |
|
||||||
+----------------+-----------------+-----------------+-----------------------------+
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
|
| ARM | Cortex-A55 | #2441007 | ARM64_ERRATUM_2441007 |
|
||||||
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
| ARM | Cortex-A57 | #832075 | ARM64_ERRATUM_832075 |
|
| ARM | Cortex-A57 | #832075 | ARM64_ERRATUM_832075 |
|
||||||
+----------------+-----------------+-----------------+-----------------------------+
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
| ARM | Cortex-A57 | #852523 | N/A |
|
| ARM | Cortex-A57 | #852523 | N/A |
|
||||||
@@ -108,6 +112,8 @@ stable kernels.
|
|||||||
+----------------+-----------------+-----------------+-----------------------------+
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
| ARM | Cortex-A510 | #2441009 | ARM64_ERRATUM_2441009 |
|
| ARM | Cortex-A510 | #2441009 | ARM64_ERRATUM_2441009 |
|
||||||
+----------------+-----------------+-----------------+-----------------------------+
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
|
| ARM | Cortex-A510 | #2658417 | ARM64_ERRATUM_2658417 |
|
||||||
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
| ARM | Cortex-A710 | #2119858 | ARM64_ERRATUM_2119858 |
|
| ARM | Cortex-A710 | #2119858 | ARM64_ERRATUM_2119858 |
|
||||||
+----------------+-----------------+-----------------+-----------------------------+
|
+----------------+-----------------+-----------------+-----------------------------+
|
||||||
| ARM | Cortex-A710 | #2054223 | ARM64_ERRATUM_2054223 |
|
| ARM | Cortex-A710 | #2054223 | ARM64_ERRATUM_2054223 |
|
||||||
|
|||||||
@@ -331,6 +331,9 @@ The regset data starts with struct user_za_header, containing:
|
|||||||
been read if a PTRACE_GETREGSET of NT_ARM_ZA were executed for each thread
|
been read if a PTRACE_GETREGSET of NT_ARM_ZA were executed for each thread
|
||||||
when the coredump was generated.
|
when the coredump was generated.
|
||||||
|
|
||||||
|
* The NT_ARM_TLS note will be extended to two registers, the second register
|
||||||
|
will contain TPIDR2_EL0 on systems that support SME and will be read as
|
||||||
|
zero with writes ignored otherwise.
|
||||||
|
|
||||||
9. System runtime configuration
|
9. System runtime configuration
|
||||||
--------------------------------
|
--------------------------------
|
||||||
|
|||||||
@@ -111,7 +111,7 @@ the SVE instruction set architecture.
|
|||||||
|
|
||||||
* On syscall, V0..V31 are preserved (as without SVE). Thus, bits [127:0] of
|
* On syscall, V0..V31 are preserved (as without SVE). Thus, bits [127:0] of
|
||||||
Z0..Z31 are preserved. All other bits of Z0..Z31, and all of P0..P15 and FFR
|
Z0..Z31 are preserved. All other bits of Z0..Z31, and all of P0..P15 and FFR
|
||||||
become unspecified on return from a syscall.
|
become zero on return from a syscall.
|
||||||
|
|
||||||
* The SVE registers are not used to pass arguments to or receive results from
|
* The SVE registers are not used to pass arguments to or receive results from
|
||||||
any syscall.
|
any syscall.
|
||||||
@@ -452,6 +452,24 @@ The regset data starts with struct user_sve_header, containing:
|
|||||||
* Modifying the system default vector length does not affect the vector length
|
* Modifying the system default vector length does not affect the vector length
|
||||||
of any existing process or thread that does not make an execve() call.
|
of any existing process or thread that does not make an execve() call.
|
||||||
|
|
||||||
|
10. Perf extensions
|
||||||
|
--------------------------------
|
||||||
|
|
||||||
|
* The arm64 specific DWARF standard [5] added the VG (Vector Granule) register
|
||||||
|
at index 46. This register is used for DWARF unwinding when variable length
|
||||||
|
SVE registers are pushed onto the stack.
|
||||||
|
|
||||||
|
* Its value is equivalent to the current SVE vector length (VL) in bits divided
|
||||||
|
by 64.
|
||||||
|
|
||||||
|
* The value is included in Perf samples in the regs[46] field if
|
||||||
|
PERF_SAMPLE_REGS_USER is set and the sample_regs_user mask has bit 46 set.
|
||||||
|
|
||||||
|
* The value is the current value at the time the sample was taken, and it can
|
||||||
|
change over time.
|
||||||
|
|
||||||
|
* If the system doesn't support SVE when perf_event_open is called with these
|
||||||
|
settings, the event will fail to open.
|
||||||
|
|
||||||
Appendix A. SVE programmer's model (informative)
|
Appendix A. SVE programmer's model (informative)
|
||||||
=================================================
|
=================================================
|
||||||
@@ -593,3 +611,5 @@ References
|
|||||||
http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055c/IHI0055C_beta_aapcs64.pdf
|
http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055c/IHI0055C_beta_aapcs64.pdf
|
||||||
http://infocenter.arm.com/help/topic/com.arm.doc.subset.swdev.abi/index.html
|
http://infocenter.arm.com/help/topic/com.arm.doc.subset.swdev.abi/index.html
|
||||||
Procedure Call Standard for the ARM 64-bit Architecture (AArch64)
|
Procedure Call Standard for the ARM 64-bit Architecture (AArch64)
|
||||||
|
|
||||||
|
[5] https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst
|
||||||
|
|||||||
@@ -58,13 +58,11 @@ Like with atomic_t, the rule of thumb is:
|
|||||||
|
|
||||||
- RMW operations that have a return value are fully ordered.
|
- RMW operations that have a return value are fully ordered.
|
||||||
|
|
||||||
- RMW operations that are conditional are unordered on FAILURE,
|
- RMW operations that are conditional are fully ordered.
|
||||||
otherwise the above rules apply. In the case of test_and_set_bit_lock(),
|
|
||||||
if the bit in memory is unchanged by the operation then it is deemed to have
|
|
||||||
failed.
|
|
||||||
|
|
||||||
Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics and
|
Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics,
|
||||||
clear_bit_unlock() which has RELEASE semantics.
|
clear_bit_unlock() which has RELEASE semantics and test_bit_acquire which has
|
||||||
|
ACQUIRE semantics.
|
||||||
|
|
||||||
Since a platform only has a single means of achieving atomic operations
|
Since a platform only has a single means of achieving atomic operations
|
||||||
the same barriers as for atomic_t are used, see atomic_t.txt.
|
the same barriers as for atomic_t are used, see atomic_t.txt.
|
||||||
|
|||||||
@@ -23,3 +23,4 @@ Block
|
|||||||
stat
|
stat
|
||||||
switching-sched
|
switching-sched
|
||||||
writeback_cache_control
|
writeback_cache_control
|
||||||
|
ublk
|
||||||
|
|||||||
253
Documentation/block/ublk.rst
Normal file
253
Documentation/block/ublk.rst
Normal file
@@ -0,0 +1,253 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
|
===========================================
|
||||||
|
Userspace block device driver (ublk driver)
|
||||||
|
===========================================
|
||||||
|
|
||||||
|
Overview
|
||||||
|
========
|
||||||
|
|
||||||
|
ublk is a generic framework for implementing block device logic from userspace.
|
||||||
|
The motivation behind it is that moving virtual block drivers into userspace,
|
||||||
|
such as loop, nbd and similar can be very helpful. It can help to implement
|
||||||
|
new virtual block device such as ublk-qcow2 (there are several attempts of
|
||||||
|
implementing qcow2 driver in kernel).
|
||||||
|
|
||||||
|
Userspace block devices are attractive because:
|
||||||
|
|
||||||
|
- They can be written many programming languages.
|
||||||
|
- They can use libraries that are not available in the kernel.
|
||||||
|
- They can be debugged with tools familiar to application developers.
|
||||||
|
- Crashes do not kernel panic the machine.
|
||||||
|
- Bugs are likely to have a lower security impact than bugs in kernel
|
||||||
|
code.
|
||||||
|
- They can be installed and updated independently of the kernel.
|
||||||
|
- They can be used to simulate block device easily with user specified
|
||||||
|
parameters/setting for test/debug purpose
|
||||||
|
|
||||||
|
ublk block device (``/dev/ublkb*``) is added by ublk driver. Any IO request
|
||||||
|
on the device will be forwarded to ublk userspace program. For convenience,
|
||||||
|
in this document, ``ublk server`` refers to generic ublk userspace
|
||||||
|
program. ``ublksrv`` [#userspace]_ is one of such implementation. It
|
||||||
|
provides ``libublksrv`` [#userspace_lib]_ library for developing specific
|
||||||
|
user block device conveniently, while also generic type block device is
|
||||||
|
included, such as loop and null. Richard W.M. Jones wrote userspace nbd device
|
||||||
|
``nbdublk`` [#userspace_nbdublk]_ based on ``libublksrv`` [#userspace_lib]_.
|
||||||
|
|
||||||
|
After the IO is handled by userspace, the result is committed back to the
|
||||||
|
driver, thus completing the request cycle. This way, any specific IO handling
|
||||||
|
logic is totally done by userspace, such as loop's IO handling, NBD's IO
|
||||||
|
communication, or qcow2's IO mapping.
|
||||||
|
|
||||||
|
``/dev/ublkb*`` is driven by blk-mq request-based driver. Each request is
|
||||||
|
assigned by one queue wide unique tag. ublk server assigns unique tag to each
|
||||||
|
IO too, which is 1:1 mapped with IO of ``/dev/ublkb*``.
|
||||||
|
|
||||||
|
Both the IO request forward and IO handling result committing are done via
|
||||||
|
``io_uring`` passthrough command; that is why ublk is also one io_uring based
|
||||||
|
block driver. It has been observed that using io_uring passthrough command can
|
||||||
|
give better IOPS than block IO; which is why ublk is one of high performance
|
||||||
|
implementation of userspace block device: not only IO request communication is
|
||||||
|
done by io_uring, but also the preferred IO handling in ublk server is io_uring
|
||||||
|
based approach too.
|
||||||
|
|
||||||
|
ublk provides control interface to set/get ublk block device parameters.
|
||||||
|
The interface is extendable and kabi compatible: basically any ublk request
|
||||||
|
queue's parameter or ublk generic feature parameters can be set/get via the
|
||||||
|
interface. Thus, ublk is generic userspace block device framework.
|
||||||
|
For example, it is easy to setup a ublk device with specified block
|
||||||
|
parameters from userspace.
|
||||||
|
|
||||||
|
Using ublk
|
||||||
|
==========
|
||||||
|
|
||||||
|
ublk requires userspace ublk server to handle real block device logic.
|
||||||
|
|
||||||
|
Below is example of using ``ublksrv`` to provide ublk-based loop device.
|
||||||
|
|
||||||
|
- add a device::
|
||||||
|
|
||||||
|
ublk add -t loop -f ublk-loop.img
|
||||||
|
|
||||||
|
- format with xfs, then use it::
|
||||||
|
|
||||||
|
mkfs.xfs /dev/ublkb0
|
||||||
|
mount /dev/ublkb0 /mnt
|
||||||
|
# do anything. all IOs are handled by io_uring
|
||||||
|
...
|
||||||
|
umount /mnt
|
||||||
|
|
||||||
|
- list the devices with their info::
|
||||||
|
|
||||||
|
ublk list
|
||||||
|
|
||||||
|
- delete the device::
|
||||||
|
|
||||||
|
ublk del -a
|
||||||
|
ublk del -n $ublk_dev_id
|
||||||
|
|
||||||
|
See usage details in README of ``ublksrv`` [#userspace_readme]_.
|
||||||
|
|
||||||
|
Design
|
||||||
|
======
|
||||||
|
|
||||||
|
Control plane
|
||||||
|
-------------
|
||||||
|
|
||||||
|
ublk driver provides global misc device node (``/dev/ublk-control``) for
|
||||||
|
managing and controlling ublk devices with help of several control commands:
|
||||||
|
|
||||||
|
- ``UBLK_CMD_ADD_DEV``
|
||||||
|
|
||||||
|
Add a ublk char device (``/dev/ublkc*``) which is talked with ublk server
|
||||||
|
WRT IO command communication. Basic device info is sent together with this
|
||||||
|
command. It sets UAPI structure of ``ublksrv_ctrl_dev_info``,
|
||||||
|
such as ``nr_hw_queues``, ``queue_depth``, and max IO request buffer size,
|
||||||
|
for which the info is negotiated with the driver and sent back to the server.
|
||||||
|
When this command is completed, the basic device info is immutable.
|
||||||
|
|
||||||
|
- ``UBLK_CMD_SET_PARAMS`` / ``UBLK_CMD_GET_PARAMS``
|
||||||
|
|
||||||
|
Set or get parameters of the device, which can be either generic feature
|
||||||
|
related, or request queue limit related, but can't be IO logic specific,
|
||||||
|
because the driver does not handle any IO logic. This command has to be
|
||||||
|
sent before sending ``UBLK_CMD_START_DEV``.
|
||||||
|
|
||||||
|
- ``UBLK_CMD_START_DEV``
|
||||||
|
|
||||||
|
After the server prepares userspace resources (such as creating per-queue
|
||||||
|
pthread & io_uring for handling ublk IO), this command is sent to the
|
||||||
|
driver for allocating & exposing ``/dev/ublkb*``. Parameters set via
|
||||||
|
``UBLK_CMD_SET_PARAMS`` are applied for creating the device.
|
||||||
|
|
||||||
|
- ``UBLK_CMD_STOP_DEV``
|
||||||
|
|
||||||
|
Halt IO on ``/dev/ublkb*`` and remove the device. When this command returns,
|
||||||
|
ublk server will release resources (such as destroying per-queue pthread &
|
||||||
|
io_uring).
|
||||||
|
|
||||||
|
- ``UBLK_CMD_DEL_DEV``
|
||||||
|
|
||||||
|
Remove ``/dev/ublkc*``. When this command returns, the allocated ublk device
|
||||||
|
number can be reused.
|
||||||
|
|
||||||
|
- ``UBLK_CMD_GET_QUEUE_AFFINITY``
|
||||||
|
|
||||||
|
When ``/dev/ublkc`` is added, the driver creates block layer tagset, so
|
||||||
|
that each queue's affinity info is available. The server sends
|
||||||
|
``UBLK_CMD_GET_QUEUE_AFFINITY`` to retrieve queue affinity info. It can
|
||||||
|
set up the per-queue context efficiently, such as bind affine CPUs with IO
|
||||||
|
pthread and try to allocate buffers in IO thread context.
|
||||||
|
|
||||||
|
- ``UBLK_CMD_GET_DEV_INFO``
|
||||||
|
|
||||||
|
For retrieving device info via ``ublksrv_ctrl_dev_info``. It is the server's
|
||||||
|
responsibility to save IO target specific info in userspace.
|
||||||
|
|
||||||
|
Data plane
|
||||||
|
----------
|
||||||
|
|
||||||
|
ublk server needs to create per-queue IO pthread & io_uring for handling IO
|
||||||
|
commands via io_uring passthrough. The per-queue IO pthread
|
||||||
|
focuses on IO handling and shouldn't handle any control & management
|
||||||
|
tasks.
|
||||||
|
|
||||||
|
The's IO is assigned by a unique tag, which is 1:1 mapping with IO
|
||||||
|
request of ``/dev/ublkb*``.
|
||||||
|
|
||||||
|
UAPI structure of ``ublksrv_io_desc`` is defined for describing each IO from
|
||||||
|
the driver. A fixed mmaped area (array) on ``/dev/ublkc*`` is provided for
|
||||||
|
exporting IO info to the server; such as IO offset, length, OP/flags and
|
||||||
|
buffer address. Each ``ublksrv_io_desc`` instance can be indexed via queue id
|
||||||
|
and IO tag directly.
|
||||||
|
|
||||||
|
The following IO commands are communicated via io_uring passthrough command,
|
||||||
|
and each command is only for forwarding the IO and committing the result
|
||||||
|
with specified IO tag in the command data:
|
||||||
|
|
||||||
|
- ``UBLK_IO_FETCH_REQ``
|
||||||
|
|
||||||
|
Sent from the server IO pthread for fetching future incoming IO requests
|
||||||
|
destined to ``/dev/ublkb*``. This command is sent only once from the server
|
||||||
|
IO pthread for ublk driver to setup IO forward environment.
|
||||||
|
|
||||||
|
- ``UBLK_IO_COMMIT_AND_FETCH_REQ``
|
||||||
|
|
||||||
|
When an IO request is destined to ``/dev/ublkb*``, the driver stores
|
||||||
|
the IO's ``ublksrv_io_desc`` to the specified mapped area; then the
|
||||||
|
previous received IO command of this IO tag (either ``UBLK_IO_FETCH_REQ``
|
||||||
|
or ``UBLK_IO_COMMIT_AND_FETCH_REQ)`` is completed, so the server gets
|
||||||
|
the IO notification via io_uring.
|
||||||
|
|
||||||
|
After the server handles the IO, its result is committed back to the
|
||||||
|
driver by sending ``UBLK_IO_COMMIT_AND_FETCH_REQ`` back. Once ublkdrv
|
||||||
|
received this command, it parses the result and complete the request to
|
||||||
|
``/dev/ublkb*``. In the meantime setup environment for fetching future
|
||||||
|
requests with the same IO tag. That is, ``UBLK_IO_COMMIT_AND_FETCH_REQ``
|
||||||
|
is reused for both fetching request and committing back IO result.
|
||||||
|
|
||||||
|
- ``UBLK_IO_NEED_GET_DATA``
|
||||||
|
|
||||||
|
With ``UBLK_F_NEED_GET_DATA`` enabled, the WRITE request will be firstly
|
||||||
|
issued to ublk server without data copy. Then, IO backend of ublk server
|
||||||
|
receives the request and it can allocate data buffer and embed its addr
|
||||||
|
inside this new io command. After the kernel driver gets the command,
|
||||||
|
data copy is done from request pages to this backend's buffer. Finally,
|
||||||
|
backend receives the request again with data to be written and it can
|
||||||
|
truly handle the request.
|
||||||
|
|
||||||
|
``UBLK_IO_NEED_GET_DATA`` adds one additional round-trip and one
|
||||||
|
io_uring_enter() syscall. Any user thinks that it may lower performance
|
||||||
|
should not enable UBLK_F_NEED_GET_DATA. ublk server pre-allocates IO
|
||||||
|
buffer for each IO by default. Any new project should try to use this
|
||||||
|
buffer to communicate with ublk driver. However, existing project may
|
||||||
|
break or not able to consume the new buffer interface; that's why this
|
||||||
|
command is added for backwards compatibility so that existing projects
|
||||||
|
can still consume existing buffers.
|
||||||
|
|
||||||
|
- data copy between ublk server IO buffer and ublk block IO request
|
||||||
|
|
||||||
|
The driver needs to copy the block IO request pages into the server buffer
|
||||||
|
(pages) first for WRITE before notifying the server of the coming IO, so
|
||||||
|
that the server can handle WRITE request.
|
||||||
|
|
||||||
|
When the server handles READ request and sends
|
||||||
|
``UBLK_IO_COMMIT_AND_FETCH_REQ`` to the server, ublkdrv needs to copy
|
||||||
|
the server buffer (pages) read to the IO request pages.
|
||||||
|
|
||||||
|
Future development
|
||||||
|
==================
|
||||||
|
|
||||||
|
Container-aware ublk deivice
|
||||||
|
----------------------------
|
||||||
|
|
||||||
|
ublk driver doesn't handle any IO logic. Its function is well defined
|
||||||
|
for now and very limited userspace interfaces are needed, which is also
|
||||||
|
well defined too. It is possible to make ublk devices container-aware block
|
||||||
|
devices in future as Stefan Hajnoczi suggested [#stefan]_, by removing
|
||||||
|
ADMIN privilege.
|
||||||
|
|
||||||
|
Zero copy
|
||||||
|
---------
|
||||||
|
|
||||||
|
Zero copy is a generic requirement for nbd, fuse or similar drivers. A
|
||||||
|
problem [#xiaoguang]_ Xiaoguang mentioned is that pages mapped to userspace
|
||||||
|
can't be remapped any more in kernel with existing mm interfaces. This can
|
||||||
|
occurs when destining direct IO to ``/dev/ublkb*``. Also, he reported that
|
||||||
|
big requests (IO size >= 256 KB) may benefit a lot from zero copy.
|
||||||
|
|
||||||
|
|
||||||
|
References
|
||||||
|
==========
|
||||||
|
|
||||||
|
.. [#userspace] https://github.com/ming1/ubdsrv
|
||||||
|
|
||||||
|
.. [#userspace_lib] https://github.com/ming1/ubdsrv/tree/master/lib
|
||||||
|
|
||||||
|
.. [#userspace_nbdublk] https://gitlab.com/rwmjones/libnbd/-/tree/nbdublk
|
||||||
|
|
||||||
|
.. [#userspace_readme] https://github.com/ming1/ubdsrv/blob/master/README
|
||||||
|
|
||||||
|
.. [#stefan] https://lore.kernel.org/linux-block/YoOr6jBfgVm8GvWg@stefanha-x1.localdomain/
|
||||||
|
|
||||||
|
.. [#xiaoguang] https://lore.kernel.org/linux-block/YoOr6jBfgVm8GvWg@stefanha-x1.localdomain/
|
||||||
30
Documentation/bpf/clang-notes.rst
Normal file
30
Documentation/bpf/clang-notes.rst
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
.. contents::
|
||||||
|
.. sectnum::
|
||||||
|
|
||||||
|
==========================
|
||||||
|
Clang implementation notes
|
||||||
|
==========================
|
||||||
|
|
||||||
|
This document provides more details specific to the Clang/LLVM implementation of the eBPF instruction set.
|
||||||
|
|
||||||
|
Versions
|
||||||
|
========
|
||||||
|
|
||||||
|
Clang defined "CPU" versions, where a CPU version of 3 corresponds to the current eBPF ISA.
|
||||||
|
|
||||||
|
Clang can select the eBPF ISA version using ``-mcpu=v3`` for example to select version 3.
|
||||||
|
|
||||||
|
Arithmetic instructions
|
||||||
|
=======================
|
||||||
|
|
||||||
|
For CPU versions prior to 3, Clang v7.0 and later can enable ``BPF_ALU`` support with
|
||||||
|
``-Xclang -target-feature -Xclang +alu32``. In CPU version 3, support is automatically included.
|
||||||
|
|
||||||
|
Atomic operations
|
||||||
|
=================
|
||||||
|
|
||||||
|
Clang can generate atomic instructions by default when ``-mcpu=v3`` is
|
||||||
|
enabled. If a lower version for ``-mcpu`` is set, the only atomic instruction
|
||||||
|
Clang can generate is ``BPF_ADD`` *without* ``BPF_FETCH``. If you need to enable
|
||||||
|
the atomics features, while keeping a lower ``-mcpu`` version, you can use
|
||||||
|
``-Xclang -target-feature -Xclang +alu32``.
|
||||||
@@ -26,6 +26,8 @@ that goes into great technical depth about the BPF Architecture.
|
|||||||
classic_vs_extended.rst
|
classic_vs_extended.rst
|
||||||
bpf_licensing
|
bpf_licensing
|
||||||
test_debug
|
test_debug
|
||||||
|
clang-notes
|
||||||
|
linux-notes
|
||||||
other
|
other
|
||||||
|
|
||||||
.. only:: subproject and html
|
.. only:: subproject and html
|
||||||
|
|||||||
@@ -1,7 +1,12 @@
|
|||||||
|
.. contents::
|
||||||
|
.. sectnum::
|
||||||
|
|
||||||
|
========================================
|
||||||
|
eBPF Instruction Set Specification, v1.0
|
||||||
|
========================================
|
||||||
|
|
||||||
|
This document specifies version 1.0 of the eBPF instruction set.
|
||||||
|
|
||||||
====================
|
|
||||||
eBPF Instruction Set
|
|
||||||
====================
|
|
||||||
|
|
||||||
Registers and calling convention
|
Registers and calling convention
|
||||||
================================
|
================================
|
||||||
@@ -44,24 +49,24 @@ Instruction classes
|
|||||||
|
|
||||||
The three LSB bits of the 'opcode' field store the instruction class:
|
The three LSB bits of the 'opcode' field store the instruction class:
|
||||||
|
|
||||||
========= ===== ===============================
|
========= ===== =============================== ===================================
|
||||||
class value description
|
class value description reference
|
||||||
========= ===== ===============================
|
========= ===== =============================== ===================================
|
||||||
BPF_LD 0x00 non-standard load operations
|
BPF_LD 0x00 non-standard load operations `Load and store instructions`_
|
||||||
BPF_LDX 0x01 load into register operations
|
BPF_LDX 0x01 load into register operations `Load and store instructions`_
|
||||||
BPF_ST 0x02 store from immediate operations
|
BPF_ST 0x02 store from immediate operations `Load and store instructions`_
|
||||||
BPF_STX 0x03 store from register operations
|
BPF_STX 0x03 store from register operations `Load and store instructions`_
|
||||||
BPF_ALU 0x04 32-bit arithmetic operations
|
BPF_ALU 0x04 32-bit arithmetic operations `Arithmetic and jump instructions`_
|
||||||
BPF_JMP 0x05 64-bit jump operations
|
BPF_JMP 0x05 64-bit jump operations `Arithmetic and jump instructions`_
|
||||||
BPF_JMP32 0x06 32-bit jump operations
|
BPF_JMP32 0x06 32-bit jump operations `Arithmetic and jump instructions`_
|
||||||
BPF_ALU64 0x07 64-bit arithmetic operations
|
BPF_ALU64 0x07 64-bit arithmetic operations `Arithmetic and jump instructions`_
|
||||||
========= ===== ===============================
|
========= ===== =============================== ===================================
|
||||||
|
|
||||||
Arithmetic and jump instructions
|
Arithmetic and jump instructions
|
||||||
================================
|
================================
|
||||||
|
|
||||||
For arithmetic and jump instructions (BPF_ALU, BPF_ALU64, BPF_JMP and
|
For arithmetic and jump instructions (``BPF_ALU``, ``BPF_ALU64``, ``BPF_JMP`` and
|
||||||
BPF_JMP32), the 8-bit 'opcode' field is divided into three parts:
|
``BPF_JMP32``), the 8-bit 'opcode' field is divided into three parts:
|
||||||
|
|
||||||
============== ====== =================
|
============== ====== =================
|
||||||
4 bits (MSB) 1 bit 3 bits (LSB)
|
4 bits (MSB) 1 bit 3 bits (LSB)
|
||||||
@@ -84,13 +89,13 @@ The four MSB bits store the operation code.
|
|||||||
Arithmetic instructions
|
Arithmetic instructions
|
||||||
-----------------------
|
-----------------------
|
||||||
|
|
||||||
BPF_ALU uses 32-bit wide operands while BPF_ALU64 uses 64-bit wide operands for
|
``BPF_ALU`` uses 32-bit wide operands while ``BPF_ALU64`` uses 64-bit wide operands for
|
||||||
otherwise identical operations.
|
otherwise identical operations.
|
||||||
The code field encodes the operation as below:
|
The 'code' field encodes the operation as below:
|
||||||
|
|
||||||
======== ===== =================================================
|
======== ===== ==========================================================
|
||||||
code value description
|
code value description
|
||||||
======== ===== =================================================
|
======== ===== ==========================================================
|
||||||
BPF_ADD 0x00 dst += src
|
BPF_ADD 0x00 dst += src
|
||||||
BPF_SUB 0x10 dst -= src
|
BPF_SUB 0x10 dst -= src
|
||||||
BPF_MUL 0x20 dst \*= src
|
BPF_MUL 0x20 dst \*= src
|
||||||
@@ -104,36 +109,36 @@ The code field encodes the operation as below:
|
|||||||
BPF_XOR 0xa0 dst ^= src
|
BPF_XOR 0xa0 dst ^= src
|
||||||
BPF_MOV 0xb0 dst = src
|
BPF_MOV 0xb0 dst = src
|
||||||
BPF_ARSH 0xc0 sign extending shift right
|
BPF_ARSH 0xc0 sign extending shift right
|
||||||
BPF_END 0xd0 byte swap operations (see separate section below)
|
BPF_END 0xd0 byte swap operations (see `Byte swap instructions`_ below)
|
||||||
======== ===== =================================================
|
======== ===== ==========================================================
|
||||||
|
|
||||||
BPF_ADD | BPF_X | BPF_ALU means::
|
``BPF_ADD | BPF_X | BPF_ALU`` means::
|
||||||
|
|
||||||
dst_reg = (u32) dst_reg + (u32) src_reg;
|
dst_reg = (u32) dst_reg + (u32) src_reg;
|
||||||
|
|
||||||
BPF_ADD | BPF_X | BPF_ALU64 means::
|
``BPF_ADD | BPF_X | BPF_ALU64`` means::
|
||||||
|
|
||||||
dst_reg = dst_reg + src_reg
|
dst_reg = dst_reg + src_reg
|
||||||
|
|
||||||
BPF_XOR | BPF_K | BPF_ALU means::
|
``BPF_XOR | BPF_K | BPF_ALU`` means::
|
||||||
|
|
||||||
src_reg = (u32) src_reg ^ (u32) imm32
|
src_reg = (u32) src_reg ^ (u32) imm32
|
||||||
|
|
||||||
BPF_XOR | BPF_K | BPF_ALU64 means::
|
``BPF_XOR | BPF_K | BPF_ALU64`` means::
|
||||||
|
|
||||||
src_reg = src_reg ^ imm32
|
src_reg = src_reg ^ imm32
|
||||||
|
|
||||||
|
|
||||||
Byte swap instructions
|
Byte swap instructions
|
||||||
----------------------
|
~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
The byte swap instructions use an instruction class of ``BPF_ALU`` and a 4-bit
|
The byte swap instructions use an instruction class of ``BPF_ALU`` and a 4-bit
|
||||||
code field of ``BPF_END``.
|
'code' field of ``BPF_END``.
|
||||||
|
|
||||||
The byte swap instructions operate on the destination register
|
The byte swap instructions operate on the destination register
|
||||||
only and do not use a separate source register or immediate value.
|
only and do not use a separate source register or immediate value.
|
||||||
|
|
||||||
The 1-bit source operand field in the opcode is used to to select what byte
|
The 1-bit source operand field in the opcode is used to select what byte
|
||||||
order the operation convert from or to:
|
order the operation convert from or to:
|
||||||
|
|
||||||
========= ===== =================================================
|
========= ===== =================================================
|
||||||
@@ -143,7 +148,7 @@ order the operation convert from or to:
|
|||||||
BPF_TO_BE 0x08 convert between host byte order and big endian
|
BPF_TO_BE 0x08 convert between host byte order and big endian
|
||||||
========= ===== =================================================
|
========= ===== =================================================
|
||||||
|
|
||||||
The imm field encodes the width of the swap operations. The following widths
|
The 'imm' field encodes the width of the swap operations. The following widths
|
||||||
are supported: 16, 32 and 64.
|
are supported: 16, 32 and 64.
|
||||||
|
|
||||||
Examples:
|
Examples:
|
||||||
@@ -156,16 +161,12 @@ Examples:
|
|||||||
|
|
||||||
dst_reg = htobe64(dst_reg)
|
dst_reg = htobe64(dst_reg)
|
||||||
|
|
||||||
``BPF_FROM_LE`` and ``BPF_FROM_BE`` exist as aliases for ``BPF_TO_LE`` and
|
|
||||||
``BPF_TO_BE`` respectively.
|
|
||||||
|
|
||||||
|
|
||||||
Jump instructions
|
Jump instructions
|
||||||
-----------------
|
-----------------
|
||||||
|
|
||||||
BPF_JMP32 uses 32-bit wide operands while BPF_JMP uses 64-bit wide operands for
|
``BPF_JMP32`` uses 32-bit wide operands while ``BPF_JMP`` uses 64-bit wide operands for
|
||||||
otherwise identical operations.
|
otherwise identical operations.
|
||||||
The code field encodes the operation as below:
|
The 'code' field encodes the operation as below:
|
||||||
|
|
||||||
======== ===== ========================= ============
|
======== ===== ========================= ============
|
||||||
code value description notes
|
code value description notes
|
||||||
@@ -193,7 +194,7 @@ BPF_EXIT.
|
|||||||
Load and store instructions
|
Load and store instructions
|
||||||
===========================
|
===========================
|
||||||
|
|
||||||
For load and store instructions (BPF_LD, BPF_LDX, BPF_ST and BPF_STX), the
|
For load and store instructions (``BPF_LD``, ``BPF_LDX``, ``BPF_ST``, and ``BPF_STX``), the
|
||||||
8-bit 'opcode' field is divided as:
|
8-bit 'opcode' field is divided as:
|
||||||
|
|
||||||
============ ====== =================
|
============ ====== =================
|
||||||
@@ -202,6 +203,18 @@ For load and store instructions (BPF_LD, BPF_LDX, BPF_ST and BPF_STX), the
|
|||||||
mode size instruction class
|
mode size instruction class
|
||||||
============ ====== =================
|
============ ====== =================
|
||||||
|
|
||||||
|
The mode modifier is one of:
|
||||||
|
|
||||||
|
============= ===== ==================================== =============
|
||||||
|
mode modifier value description reference
|
||||||
|
============= ===== ==================================== =============
|
||||||
|
BPF_IMM 0x00 64-bit immediate instructions `64-bit immediate instructions`_
|
||||||
|
BPF_ABS 0x20 legacy BPF packet access (absolute) `Legacy BPF Packet access instructions`_
|
||||||
|
BPF_IND 0x40 legacy BPF packet access (indirect) `Legacy BPF Packet access instructions`_
|
||||||
|
BPF_MEM 0x60 regular load and store operations `Regular load and store operations`_
|
||||||
|
BPF_ATOMIC 0xc0 atomic operations `Atomic operations`_
|
||||||
|
============= ===== ==================================== =============
|
||||||
|
|
||||||
The size modifier is one of:
|
The size modifier is one of:
|
||||||
|
|
||||||
============= ===== =====================
|
============= ===== =====================
|
||||||
@@ -213,19 +226,6 @@ The size modifier is one of:
|
|||||||
BPF_DW 0x18 double word (8 bytes)
|
BPF_DW 0x18 double word (8 bytes)
|
||||||
============= ===== =====================
|
============= ===== =====================
|
||||||
|
|
||||||
The mode modifier is one of:
|
|
||||||
|
|
||||||
============= ===== ====================================
|
|
||||||
mode modifier value description
|
|
||||||
============= ===== ====================================
|
|
||||||
BPF_IMM 0x00 64-bit immediate instructions
|
|
||||||
BPF_ABS 0x20 legacy BPF packet access (absolute)
|
|
||||||
BPF_IND 0x40 legacy BPF packet access (indirect)
|
|
||||||
BPF_MEM 0x60 regular load and store operations
|
|
||||||
BPF_ATOMIC 0xc0 atomic operations
|
|
||||||
============= ===== ====================================
|
|
||||||
|
|
||||||
|
|
||||||
Regular load and store operations
|
Regular load and store operations
|
||||||
---------------------------------
|
---------------------------------
|
||||||
|
|
||||||
@@ -260,9 +260,9 @@ that use the ``BPF_ATOMIC`` mode modifier as follows:
|
|||||||
* ``BPF_ATOMIC | BPF_DW | BPF_STX`` for 64-bit operations
|
* ``BPF_ATOMIC | BPF_DW | BPF_STX`` for 64-bit operations
|
||||||
* 8-bit and 16-bit wide atomic operations are not supported.
|
* 8-bit and 16-bit wide atomic operations are not supported.
|
||||||
|
|
||||||
The imm field is used to encode the actual atomic operation.
|
The 'imm' field is used to encode the actual atomic operation.
|
||||||
Simple atomic operation use a subset of the values defined to encode
|
Simple atomic operation use a subset of the values defined to encode
|
||||||
arithmetic operations in the imm field to encode the atomic operation:
|
arithmetic operations in the 'imm' field to encode the atomic operation:
|
||||||
|
|
||||||
======== ===== ===========
|
======== ===== ===========
|
||||||
imm value description
|
imm value description
|
||||||
@@ -274,16 +274,14 @@ arithmetic operations in the imm field to encode the atomic operation:
|
|||||||
======== ===== ===========
|
======== ===== ===========
|
||||||
|
|
||||||
|
|
||||||
``BPF_ATOMIC | BPF_W | BPF_STX`` with imm = BPF_ADD means::
|
``BPF_ATOMIC | BPF_W | BPF_STX`` with 'imm' = BPF_ADD means::
|
||||||
|
|
||||||
*(u32 *)(dst_reg + off16) += src_reg
|
*(u32 *)(dst_reg + off16) += src_reg
|
||||||
|
|
||||||
``BPF_ATOMIC | BPF_DW | BPF_STX`` with imm = BPF ADD means::
|
``BPF_ATOMIC | BPF_DW | BPF_STX`` with 'imm' = BPF ADD means::
|
||||||
|
|
||||||
*(u64 *)(dst_reg + off16) += src_reg
|
*(u64 *)(dst_reg + off16) += src_reg
|
||||||
|
|
||||||
``BPF_XADD`` is a deprecated name for ``BPF_ATOMIC | BPF_ADD``.
|
|
||||||
|
|
||||||
In addition to the simple atomic operations, there also is a modifier and
|
In addition to the simple atomic operations, there also is a modifier and
|
||||||
two complex atomic operations:
|
two complex atomic operations:
|
||||||
|
|
||||||
@@ -309,16 +307,10 @@ The ``BPF_CMPXCHG`` operation atomically compares the value addressed by
|
|||||||
value that was at ``dst_reg + off`` before the operation is zero-extended
|
value that was at ``dst_reg + off`` before the operation is zero-extended
|
||||||
and loaded back to ``R0``.
|
and loaded back to ``R0``.
|
||||||
|
|
||||||
Clang can generate atomic instructions by default when ``-mcpu=v3`` is
|
|
||||||
enabled. If a lower version for ``-mcpu`` is set, the only atomic instruction
|
|
||||||
Clang can generate is ``BPF_ADD`` *without* ``BPF_FETCH``. If you need to enable
|
|
||||||
the atomics features, while keeping a lower ``-mcpu`` version, you can use
|
|
||||||
``-Xclang -target-feature -Xclang +alu32``.
|
|
||||||
|
|
||||||
64-bit immediate instructions
|
64-bit immediate instructions
|
||||||
-----------------------------
|
-----------------------------
|
||||||
|
|
||||||
Instructions with the ``BPF_IMM`` mode modifier use the wide instruction
|
Instructions with the ``BPF_IMM`` 'mode' modifier use the wide instruction
|
||||||
encoding for an extra imm64 value.
|
encoding for an extra imm64 value.
|
||||||
|
|
||||||
There is currently only one such instruction.
|
There is currently only one such instruction.
|
||||||
@@ -331,36 +323,6 @@ There is currently only one such instruction.
|
|||||||
Legacy BPF Packet access instructions
|
Legacy BPF Packet access instructions
|
||||||
-------------------------------------
|
-------------------------------------
|
||||||
|
|
||||||
eBPF has special instructions for access to packet data that have been
|
eBPF previously introduced special instructions for access to packet data that were
|
||||||
carried over from classic BPF to retain the performance of legacy socket
|
carried over from classic BPF. However, these instructions are
|
||||||
filters running in the eBPF interpreter.
|
deprecated and should no longer be used.
|
||||||
|
|
||||||
The instructions come in two forms: ``BPF_ABS | <size> | BPF_LD`` and
|
|
||||||
``BPF_IND | <size> | BPF_LD``.
|
|
||||||
|
|
||||||
These instructions are used to access packet data and can only be used when
|
|
||||||
the program context is a pointer to networking packet. ``BPF_ABS``
|
|
||||||
accesses packet data at an absolute offset specified by the immediate data
|
|
||||||
and ``BPF_IND`` access packet data at an offset that includes the value of
|
|
||||||
a register in addition to the immediate data.
|
|
||||||
|
|
||||||
These instructions have seven implicit operands:
|
|
||||||
|
|
||||||
* Register R6 is an implicit input that must contain pointer to a
|
|
||||||
struct sk_buff.
|
|
||||||
* Register R0 is an implicit output which contains the data fetched from
|
|
||||||
the packet.
|
|
||||||
* Registers R1-R5 are scratch registers that are clobbered after a call to
|
|
||||||
``BPF_ABS | BPF_LD`` or ``BPF_IND | BPF_LD`` instructions.
|
|
||||||
|
|
||||||
These instructions have an implicit program exit condition as well. When an
|
|
||||||
eBPF program is trying to access the data beyond the packet boundary, the
|
|
||||||
program execution will be aborted.
|
|
||||||
|
|
||||||
``BPF_ABS | BPF_W | BPF_LD`` means::
|
|
||||||
|
|
||||||
R0 = ntohl(*(u32 *) (((struct sk_buff *) R6)->data + imm32))
|
|
||||||
|
|
||||||
``BPF_IND | BPF_W | BPF_LD`` means::
|
|
||||||
|
|
||||||
R0 = ntohl(*(u32 *) (((struct sk_buff *) R6)->data + src_reg + imm32))
|
|
||||||
|
|||||||
@@ -137,14 +137,37 @@ KF_ACQUIRE and KF_RET_NULL flags.
|
|||||||
--------------------------
|
--------------------------
|
||||||
|
|
||||||
The KF_TRUSTED_ARGS flag is used for kfuncs taking pointer arguments. It
|
The KF_TRUSTED_ARGS flag is used for kfuncs taking pointer arguments. It
|
||||||
indicates that the all pointer arguments will always be refcounted, and have
|
indicates that the all pointer arguments will always have a guaranteed lifetime,
|
||||||
their offset set to 0. It can be used to enforce that a pointer to a refcounted
|
and pointers to kernel objects are always passed to helpers in their unmodified
|
||||||
object acquired from a kfunc or BPF helper is passed as an argument to this
|
form (as obtained from acquire kfuncs).
|
||||||
kfunc without any modifications (e.g. pointer arithmetic) such that it is
|
|
||||||
trusted and points to the original object. This flag is often used for kfuncs
|
It can be used to enforce that a pointer to a refcounted object acquired from a
|
||||||
that operate (change some property, perform some operation) on an object that
|
kfunc or BPF helper is passed as an argument to this kfunc without any
|
||||||
was obtained using an acquire kfunc. Such kfuncs need an unchanged pointer to
|
modifications (e.g. pointer arithmetic) such that it is trusted and points to
|
||||||
ensure the integrity of the operation being performed on the expected object.
|
the original object.
|
||||||
|
|
||||||
|
Meanwhile, it is also allowed pass pointers to normal memory to such kfuncs,
|
||||||
|
but those can have a non-zero offset.
|
||||||
|
|
||||||
|
This flag is often used for kfuncs that operate (change some property, perform
|
||||||
|
some operation) on an object that was obtained using an acquire kfunc. Such
|
||||||
|
kfuncs need an unchanged pointer to ensure the integrity of the operation being
|
||||||
|
performed on the expected object.
|
||||||
|
|
||||||
|
2.4.6 KF_SLEEPABLE flag
|
||||||
|
-----------------------
|
||||||
|
|
||||||
|
The KF_SLEEPABLE flag is used for kfuncs that may sleep. Such kfuncs can only
|
||||||
|
be called by sleepable BPF programs (BPF_F_SLEEPABLE).
|
||||||
|
|
||||||
|
2.4.7 KF_DESTRUCTIVE flag
|
||||||
|
--------------------------
|
||||||
|
|
||||||
|
The KF_DESTRUCTIVE flag is used to indicate functions calling which is
|
||||||
|
destructive to the system. For example such a call can result in system
|
||||||
|
rebooting or panicking. Due to this additional restrictions apply to these
|
||||||
|
calls. At the moment they only require CAP_SYS_BOOT capability, but more can be
|
||||||
|
added later.
|
||||||
|
|
||||||
2.5 Registering the kfuncs
|
2.5 Registering the kfuncs
|
||||||
--------------------------
|
--------------------------
|
||||||
|
|||||||
53
Documentation/bpf/linux-notes.rst
Normal file
53
Documentation/bpf/linux-notes.rst
Normal file
@@ -0,0 +1,53 @@
|
|||||||
|
.. contents::
|
||||||
|
.. sectnum::
|
||||||
|
|
||||||
|
==========================
|
||||||
|
Linux implementation notes
|
||||||
|
==========================
|
||||||
|
|
||||||
|
This document provides more details specific to the Linux kernel implementation of the eBPF instruction set.
|
||||||
|
|
||||||
|
Byte swap instructions
|
||||||
|
======================
|
||||||
|
|
||||||
|
``BPF_FROM_LE`` and ``BPF_FROM_BE`` exist as aliases for ``BPF_TO_LE`` and ``BPF_TO_BE`` respectively.
|
||||||
|
|
||||||
|
Legacy BPF Packet access instructions
|
||||||
|
=====================================
|
||||||
|
|
||||||
|
As mentioned in the `ISA standard documentation <instruction-set.rst#legacy-bpf-packet-access-instructions>`_,
|
||||||
|
Linux has special eBPF instructions for access to packet data that have been
|
||||||
|
carried over from classic BPF to retain the performance of legacy socket
|
||||||
|
filters running in the eBPF interpreter.
|
||||||
|
|
||||||
|
The instructions come in two forms: ``BPF_ABS | <size> | BPF_LD`` and
|
||||||
|
``BPF_IND | <size> | BPF_LD``.
|
||||||
|
|
||||||
|
These instructions are used to access packet data and can only be used when
|
||||||
|
the program context is a pointer to a networking packet. ``BPF_ABS``
|
||||||
|
accesses packet data at an absolute offset specified by the immediate data
|
||||||
|
and ``BPF_IND`` access packet data at an offset that includes the value of
|
||||||
|
a register in addition to the immediate data.
|
||||||
|
|
||||||
|
These instructions have seven implicit operands:
|
||||||
|
|
||||||
|
* Register R6 is an implicit input that must contain a pointer to a
|
||||||
|
struct sk_buff.
|
||||||
|
* Register R0 is an implicit output which contains the data fetched from
|
||||||
|
the packet.
|
||||||
|
* Registers R1-R5 are scratch registers that are clobbered by the
|
||||||
|
instruction.
|
||||||
|
|
||||||
|
These instructions have an implicit program exit condition as well. If an
|
||||||
|
eBPF program attempts access data beyond the packet boundary, the
|
||||||
|
program execution will be aborted.
|
||||||
|
|
||||||
|
``BPF_ABS | BPF_W | BPF_LD`` (0x20) means::
|
||||||
|
|
||||||
|
R0 = ntohl(*(u32 *) ((struct sk_buff *) R6->data + imm))
|
||||||
|
|
||||||
|
where ``ntohl()`` converts a 32-bit value from network byte order to host byte order.
|
||||||
|
|
||||||
|
``BPF_IND | BPF_W | BPF_LD`` (0x40) means::
|
||||||
|
|
||||||
|
R0 = ntohl(*(u32 *) ((struct sk_buff *) R6->data + src + imm))
|
||||||
@@ -31,7 +31,7 @@ The map uses key of type of either ``__u64 cgroup_inode_id`` or
|
|||||||
};
|
};
|
||||||
|
|
||||||
``cgroup_inode_id`` is the inode id of the cgroup directory.
|
``cgroup_inode_id`` is the inode id of the cgroup directory.
|
||||||
``attach_type`` is the the program's attach type.
|
``attach_type`` is the program's attach type.
|
||||||
|
|
||||||
Linux 5.9 added support for type ``__u64 cgroup_inode_id`` as the key type.
|
Linux 5.9 added support for type ``__u64 cgroup_inode_id`` as the key type.
|
||||||
When this key type is used, then all attach types of the particular cgroup and
|
When this key type is used, then all attach types of the particular cgroup and
|
||||||
@@ -155,7 +155,7 @@ However, the BPF program can still only associate with one map of each type
|
|||||||
``BPF_MAP_TYPE_CGROUP_STORAGE`` or more than one
|
``BPF_MAP_TYPE_CGROUP_STORAGE`` or more than one
|
||||||
``BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE``.
|
``BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE``.
|
||||||
|
|
||||||
In all versions, userspace may use the the attach parameters of cgroup and
|
In all versions, userspace may use the attach parameters of cgroup and
|
||||||
attach type pair in ``struct bpf_cgroup_storage_key`` as the key to the BPF map
|
attach type pair in ``struct bpf_cgroup_storage_key`` as the key to the BPF map
|
||||||
APIs to read or update the storage for a given attachment. For Linux 5.9
|
APIs to read or update the storage for a given attachment. For Linux 5.9
|
||||||
attach type shared storages, only the first value in the struct, cgroup inode
|
attach type shared storages, only the first value in the struct, cgroup inode
|
||||||
|
|||||||
@@ -15,6 +15,18 @@
|
|||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
import sphinx
|
import sphinx
|
||||||
|
import shutil
|
||||||
|
|
||||||
|
# helper
|
||||||
|
# ------
|
||||||
|
|
||||||
|
def have_command(cmd):
|
||||||
|
"""Search ``cmd`` in the ``PATH`` environment.
|
||||||
|
|
||||||
|
If found, return True.
|
||||||
|
If not found, return False.
|
||||||
|
"""
|
||||||
|
return shutil.which(cmd) is not None
|
||||||
|
|
||||||
# Get Sphinx version
|
# Get Sphinx version
|
||||||
major, minor, patch = sphinx.version_info[:3]
|
major, minor, patch = sphinx.version_info[:3]
|
||||||
@@ -86,6 +98,7 @@ if major >= 3:
|
|||||||
"__used",
|
"__used",
|
||||||
"__weak",
|
"__weak",
|
||||||
"noinline",
|
"noinline",
|
||||||
|
"__fix_address",
|
||||||
|
|
||||||
# include/linux/memblock.h:
|
# include/linux/memblock.h:
|
||||||
"__init_memblock",
|
"__init_memblock",
|
||||||
@@ -106,7 +119,32 @@ else:
|
|||||||
autosectionlabel_prefix_document = True
|
autosectionlabel_prefix_document = True
|
||||||
autosectionlabel_maxdepth = 2
|
autosectionlabel_maxdepth = 2
|
||||||
|
|
||||||
|
# Load math renderer:
|
||||||
|
# For html builder, load imgmath only when its dependencies are met.
|
||||||
|
# mathjax is the default math renderer since Sphinx 1.8.
|
||||||
|
have_latex = have_command('latex')
|
||||||
|
have_dvipng = have_command('dvipng')
|
||||||
|
load_imgmath = have_latex and have_dvipng
|
||||||
|
|
||||||
|
# Respect SPHINX_IMGMATH (for html docs only)
|
||||||
|
if 'SPHINX_IMGMATH' in os.environ:
|
||||||
|
env_sphinx_imgmath = os.environ['SPHINX_IMGMATH']
|
||||||
|
if 'yes' in env_sphinx_imgmath:
|
||||||
|
load_imgmath = True
|
||||||
|
elif 'no' in env_sphinx_imgmath:
|
||||||
|
load_imgmath = False
|
||||||
|
else:
|
||||||
|
sys.stderr.write("Unknown env SPHINX_IMGMATH=%s ignored.\n" % env_sphinx_imgmath)
|
||||||
|
|
||||||
|
# Always load imgmath for Sphinx <1.8 or for epub docs
|
||||||
|
load_imgmath = (load_imgmath or (major == 1 and minor < 8)
|
||||||
|
or 'epub' in sys.argv)
|
||||||
|
|
||||||
|
if load_imgmath:
|
||||||
extensions.append("sphinx.ext.imgmath")
|
extensions.append("sphinx.ext.imgmath")
|
||||||
|
math_renderer = 'imgmath'
|
||||||
|
else:
|
||||||
|
math_renderer = 'mathjax'
|
||||||
|
|
||||||
# Add any paths that contain templates here, relative to this directory.
|
# Add any paths that contain templates here, relative to this directory.
|
||||||
templates_path = ['_templates']
|
templates_path = ['_templates']
|
||||||
@@ -332,7 +370,8 @@ html_static_path = ['sphinx-static']
|
|||||||
html_use_smartypants = False
|
html_use_smartypants = False
|
||||||
|
|
||||||
# Custom sidebar templates, maps document names to template names.
|
# Custom sidebar templates, maps document names to template names.
|
||||||
#html_sidebars = {}
|
# Note that the RTD theme ignores this.
|
||||||
|
html_sidebars = { '**': ['searchbox.html', 'localtoc.html', 'sourcelink.html']}
|
||||||
|
|
||||||
# Additional templates that should be rendered to pages, maps page names to
|
# Additional templates that should be rendered to pages, maps page names to
|
||||||
# template names.
|
# template names.
|
||||||
|
|||||||
@@ -43,10 +43,11 @@ annotated objects like this, tools can be run on them to generate more useful
|
|||||||
information. In particular, on properly annotated objects, ``objtool`` can be
|
information. In particular, on properly annotated objects, ``objtool`` can be
|
||||||
run to check and fix the object if needed. Currently, ``objtool`` can report
|
run to check and fix the object if needed. Currently, ``objtool`` can report
|
||||||
missing frame pointer setup/destruction in functions. It can also
|
missing frame pointer setup/destruction in functions. It can also
|
||||||
automatically generate annotations for :doc:`ORC unwinder <x86/orc-unwinder>`
|
automatically generate annotations for the ORC unwinder
|
||||||
|
(Documentation/x86/orc-unwinder.rst)
|
||||||
for most code. Both of these are especially important to support reliable
|
for most code. Both of these are especially important to support reliable
|
||||||
stack traces which are in turn necessary for :doc:`Kernel live patching
|
stack traces which are in turn necessary for kernel live patching
|
||||||
<livepatch/livepatch>`.
|
(Documentation/livepatch/livepatch.rst).
|
||||||
|
|
||||||
Caveat and Discussion
|
Caveat and Discussion
|
||||||
---------------------
|
---------------------
|
||||||
@@ -560,7 +560,7 @@ available:
|
|||||||
* cpuhp_state_remove_instance(state, node)
|
* cpuhp_state_remove_instance(state, node)
|
||||||
* cpuhp_state_remove_instance_nocalls(state, node)
|
* cpuhp_state_remove_instance_nocalls(state, node)
|
||||||
|
|
||||||
The arguments are the same as for the the cpuhp_state_add_instance*()
|
The arguments are the same as for the cpuhp_state_add_instance*()
|
||||||
variants above.
|
variants above.
|
||||||
|
|
||||||
The functions differ in the way how the installed callbacks are treated:
|
The functions differ in the way how the installed callbacks are treated:
|
||||||
|
|||||||
@@ -23,6 +23,7 @@ it.
|
|||||||
printk-formats
|
printk-formats
|
||||||
printk-index
|
printk-index
|
||||||
symbol-namespaces
|
symbol-namespaces
|
||||||
|
asm-annotations
|
||||||
|
|
||||||
Data structures and low-level utilities
|
Data structures and low-level utilities
|
||||||
=======================================
|
=======================================
|
||||||
@@ -36,6 +37,7 @@ Library functionality that is used throughout the kernel.
|
|||||||
kref
|
kref
|
||||||
assoc_array
|
assoc_array
|
||||||
xarray
|
xarray
|
||||||
|
maple_tree
|
||||||
idr
|
idr
|
||||||
circular-buffers
|
circular-buffers
|
||||||
rbtree
|
rbtree
|
||||||
@@ -44,6 +46,8 @@ Library functionality that is used throughout the kernel.
|
|||||||
this_cpu_ops
|
this_cpu_ops
|
||||||
timekeeping
|
timekeeping
|
||||||
errseq
|
errseq
|
||||||
|
wrappers/atomic_t
|
||||||
|
wrappers/atomic_bitops
|
||||||
|
|
||||||
Low level entry and exit
|
Low level entry and exit
|
||||||
========================
|
========================
|
||||||
@@ -67,6 +71,7 @@ Documentation/locking/index.rst for more related documentation.
|
|||||||
local_ops
|
local_ops
|
||||||
padata
|
padata
|
||||||
../RCU/index
|
../RCU/index
|
||||||
|
wrappers/memory-barriers.rst
|
||||||
|
|
||||||
Low-level hardware management
|
Low-level hardware management
|
||||||
=============================
|
=============================
|
||||||
|
|||||||
217
Documentation/core-api/maple_tree.rst
Normal file
217
Documentation/core-api/maple_tree.rst
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0+
|
||||||
|
|
||||||
|
|
||||||
|
==========
|
||||||
|
Maple Tree
|
||||||
|
==========
|
||||||
|
|
||||||
|
:Author: Liam R. Howlett
|
||||||
|
|
||||||
|
Overview
|
||||||
|
========
|
||||||
|
|
||||||
|
The Maple Tree is a B-Tree data type which is optimized for storing
|
||||||
|
non-overlapping ranges, including ranges of size 1. The tree was designed to
|
||||||
|
be simple to use and does not require a user written search method. It
|
||||||
|
supports iterating over a range of entries and going to the previous or next
|
||||||
|
entry in a cache-efficient manner. The tree can also be put into an RCU-safe
|
||||||
|
mode of operation which allows reading and writing concurrently. Writers must
|
||||||
|
synchronize on a lock, which can be the default spinlock, or the user can set
|
||||||
|
the lock to an external lock of a different type.
|
||||||
|
|
||||||
|
The Maple Tree maintains a small memory footprint and was designed to use
|
||||||
|
modern processor cache efficiently. The majority of the users will be able to
|
||||||
|
use the normal API. An :ref:`maple-tree-advanced-api` exists for more complex
|
||||||
|
scenarios. The most important usage of the Maple Tree is the tracking of the
|
||||||
|
virtual memory areas.
|
||||||
|
|
||||||
|
The Maple Tree can store values between ``0`` and ``ULONG_MAX``. The Maple
|
||||||
|
Tree reserves values with the bottom two bits set to '10' which are below 4096
|
||||||
|
(ie 2, 6, 10 .. 4094) for internal use. If the entries may use reserved
|
||||||
|
entries then the users can convert the entries using xa_mk_value() and convert
|
||||||
|
them back by calling xa_to_value(). If the user needs to use a reserved
|
||||||
|
value, then the user can convert the value when using the
|
||||||
|
:ref:`maple-tree-advanced-api`, but are blocked by the normal API.
|
||||||
|
|
||||||
|
The Maple Tree can also be configured to support searching for a gap of a given
|
||||||
|
size (or larger).
|
||||||
|
|
||||||
|
Pre-allocating of nodes is also supported using the
|
||||||
|
:ref:`maple-tree-advanced-api`. This is useful for users who must guarantee a
|
||||||
|
successful store operation within a given
|
||||||
|
code segment when allocating cannot be done. Allocations of nodes are
|
||||||
|
relatively small at around 256 bytes.
|
||||||
|
|
||||||
|
.. _maple-tree-normal-api:
|
||||||
|
|
||||||
|
Normal API
|
||||||
|
==========
|
||||||
|
|
||||||
|
Start by initialising a maple tree, either with DEFINE_MTREE() for statically
|
||||||
|
allocated maple trees or mt_init() for dynamically allocated ones. A
|
||||||
|
freshly-initialised maple tree contains a ``NULL`` pointer for the range ``0``
|
||||||
|
- ``ULONG_MAX``. There are currently two types of maple trees supported: the
|
||||||
|
allocation tree and the regular tree. The regular tree has a higher branching
|
||||||
|
factor for internal nodes. The allocation tree has a lower branching factor
|
||||||
|
but allows the user to search for a gap of a given size or larger from either
|
||||||
|
``0`` upwards or ``ULONG_MAX`` down. An allocation tree can be used by
|
||||||
|
passing in the ``MT_FLAGS_ALLOC_RANGE`` flag when initialising the tree.
|
||||||
|
|
||||||
|
You can then set entries using mtree_store() or mtree_store_range().
|
||||||
|
mtree_store() will overwrite any entry with the new entry and return 0 on
|
||||||
|
success or an error code otherwise. mtree_store_range() works in the same way
|
||||||
|
but takes a range. mtree_load() is used to retrieve the entry stored at a
|
||||||
|
given index. You can use mtree_erase() to erase an entire range by only
|
||||||
|
knowing one value within that range, or mtree_store() call with an entry of
|
||||||
|
NULL may be used to partially erase a range or many ranges at once.
|
||||||
|
|
||||||
|
If you want to only store a new entry to a range (or index) if that range is
|
||||||
|
currently ``NULL``, you can use mtree_insert_range() or mtree_insert() which
|
||||||
|
return -EEXIST if the range is not empty.
|
||||||
|
|
||||||
|
You can search for an entry from an index upwards by using mt_find().
|
||||||
|
|
||||||
|
You can walk each entry within a range by calling mt_for_each(). You must
|
||||||
|
provide a temporary variable to store a cursor. If you want to walk each
|
||||||
|
element of the tree then ``0`` and ``ULONG_MAX`` may be used as the range. If
|
||||||
|
the caller is going to hold the lock for the duration of the walk then it is
|
||||||
|
worth looking at the mas_for_each() API in the :ref:`maple-tree-advanced-api`
|
||||||
|
section.
|
||||||
|
|
||||||
|
Sometimes it is necessary to ensure the next call to store to a maple tree does
|
||||||
|
not allocate memory, please see :ref:`maple-tree-advanced-api` for this use case.
|
||||||
|
|
||||||
|
Finally, you can remove all entries from a maple tree by calling
|
||||||
|
mtree_destroy(). If the maple tree entries are pointers, you may wish to free
|
||||||
|
the entries first.
|
||||||
|
|
||||||
|
Allocating Nodes
|
||||||
|
----------------
|
||||||
|
|
||||||
|
The allocations are handled by the internal tree code. See
|
||||||
|
:ref:`maple-tree-advanced-alloc` for other options.
|
||||||
|
|
||||||
|
Locking
|
||||||
|
-------
|
||||||
|
|
||||||
|
You do not have to worry about locking. See :ref:`maple-tree-advanced-locks`
|
||||||
|
for other options.
|
||||||
|
|
||||||
|
The Maple Tree uses RCU and an internal spinlock to synchronise access:
|
||||||
|
|
||||||
|
Takes RCU read lock:
|
||||||
|
* mtree_load()
|
||||||
|
* mt_find()
|
||||||
|
* mt_for_each()
|
||||||
|
* mt_next()
|
||||||
|
* mt_prev()
|
||||||
|
|
||||||
|
Takes ma_lock internally:
|
||||||
|
* mtree_store()
|
||||||
|
* mtree_store_range()
|
||||||
|
* mtree_insert()
|
||||||
|
* mtree_insert_range()
|
||||||
|
* mtree_erase()
|
||||||
|
* mtree_destroy()
|
||||||
|
* mt_set_in_rcu()
|
||||||
|
* mt_clear_in_rcu()
|
||||||
|
|
||||||
|
If you want to take advantage of the internal lock to protect the data
|
||||||
|
structures that you are storing in the Maple Tree, you can call mtree_lock()
|
||||||
|
before calling mtree_load(), then take a reference count on the object you
|
||||||
|
have found before calling mtree_unlock(). This will prevent stores from
|
||||||
|
removing the object from the tree between looking up the object and
|
||||||
|
incrementing the refcount. You can also use RCU to avoid dereferencing
|
||||||
|
freed memory, but an explanation of that is beyond the scope of this
|
||||||
|
document.
|
||||||
|
|
||||||
|
.. _maple-tree-advanced-api:
|
||||||
|
|
||||||
|
Advanced API
|
||||||
|
============
|
||||||
|
|
||||||
|
The advanced API offers more flexibility and better performance at the
|
||||||
|
cost of an interface which can be harder to use and has fewer safeguards.
|
||||||
|
You must take care of your own locking while using the advanced API.
|
||||||
|
You can use the ma_lock, RCU or an external lock for protection.
|
||||||
|
You can mix advanced and normal operations on the same array, as long
|
||||||
|
as the locking is compatible. The :ref:`maple-tree-normal-api` is implemented
|
||||||
|
in terms of the advanced API.
|
||||||
|
|
||||||
|
The advanced API is based around the ma_state, this is where the 'mas'
|
||||||
|
prefix originates. The ma_state struct keeps track of tree operations to make
|
||||||
|
life easier for both internal and external tree users.
|
||||||
|
|
||||||
|
Initialising the maple tree is the same as in the :ref:`maple-tree-normal-api`.
|
||||||
|
Please see above.
|
||||||
|
|
||||||
|
The maple state keeps track of the range start and end in mas->index and
|
||||||
|
mas->last, respectively.
|
||||||
|
|
||||||
|
mas_walk() will walk the tree to the location of mas->index and set the
|
||||||
|
mas->index and mas->last according to the range for the entry.
|
||||||
|
|
||||||
|
You can set entries using mas_store(). mas_store() will overwrite any entry
|
||||||
|
with the new entry and return the first existing entry that is overwritten.
|
||||||
|
The range is passed in as members of the maple state: index and last.
|
||||||
|
|
||||||
|
You can use mas_erase() to erase an entire range by setting index and
|
||||||
|
last of the maple state to the desired range to erase. This will erase
|
||||||
|
the first range that is found in that range, set the maple state index
|
||||||
|
and last as the range that was erased and return the entry that existed
|
||||||
|
at that location.
|
||||||
|
|
||||||
|
You can walk each entry within a range by using mas_for_each(). If you want
|
||||||
|
to walk each element of the tree then ``0`` and ``ULONG_MAX`` may be used as
|
||||||
|
the range. If the lock needs to be periodically dropped, see the locking
|
||||||
|
section mas_pause().
|
||||||
|
|
||||||
|
Using a maple state allows mas_next() and mas_prev() to function as if the
|
||||||
|
tree was a linked list. With such a high branching factor the amortized
|
||||||
|
performance penalty is outweighed by cache optimization. mas_next() will
|
||||||
|
return the next entry which occurs after the entry at index. mas_prev()
|
||||||
|
will return the previous entry which occurs before the entry at index.
|
||||||
|
|
||||||
|
mas_find() will find the first entry which exists at or above index on
|
||||||
|
the first call, and the next entry from every subsequent calls.
|
||||||
|
|
||||||
|
mas_find_rev() will find the fist entry which exists at or below the last on
|
||||||
|
the first call, and the previous entry from every subsequent calls.
|
||||||
|
|
||||||
|
If the user needs to yield the lock during an operation, then the maple state
|
||||||
|
must be paused using mas_pause().
|
||||||
|
|
||||||
|
There are a few extra interfaces provided when using an allocation tree.
|
||||||
|
If you wish to search for a gap within a range, then mas_empty_area()
|
||||||
|
or mas_empty_area_rev() can be used. mas_empty_area() searches for a gap
|
||||||
|
starting at the lowest index given up to the maximum of the range.
|
||||||
|
mas_empty_area_rev() searches for a gap starting at the highest index given
|
||||||
|
and continues downward to the lower bound of the range.
|
||||||
|
|
||||||
|
.. _maple-tree-advanced-alloc:
|
||||||
|
|
||||||
|
Advanced Allocating Nodes
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
Allocations are usually handled internally to the tree, however if allocations
|
||||||
|
need to occur before a write occurs then calling mas_expected_entries() will
|
||||||
|
allocate the worst-case number of needed nodes to insert the provided number of
|
||||||
|
ranges. This also causes the tree to enter mass insertion mode. Once
|
||||||
|
insertions are complete calling mas_destroy() on the maple state will free the
|
||||||
|
unused allocations.
|
||||||
|
|
||||||
|
.. _maple-tree-advanced-locks:
|
||||||
|
|
||||||
|
Advanced Locking
|
||||||
|
----------------
|
||||||
|
|
||||||
|
The maple tree uses a spinlock by default, but external locks can be used for
|
||||||
|
tree updates as well. To use an external lock, the tree must be initialized
|
||||||
|
with the ``MT_FLAGS_LOCK_EXTERN flag``, this is usually done with the
|
||||||
|
MTREE_INIT_EXT() #define, which takes an external lock as an argument.
|
||||||
|
|
||||||
|
Functions and structures
|
||||||
|
========================
|
||||||
|
|
||||||
|
.. kernel-doc:: include/linux/maple_tree.h
|
||||||
|
.. kernel-doc:: lib/maple_tree.c
|
||||||
@@ -19,9 +19,6 @@ User Space Memory Access
|
|||||||
Memory Allocation Controls
|
Memory Allocation Controls
|
||||||
==========================
|
==========================
|
||||||
|
|
||||||
.. kernel-doc:: include/linux/gfp.h
|
|
||||||
:internal:
|
|
||||||
|
|
||||||
.. kernel-doc:: include/linux/gfp_types.h
|
.. kernel-doc:: include/linux/gfp_types.h
|
||||||
:doc: Page mobility and placement hints
|
:doc: Page mobility and placement hints
|
||||||
|
|
||||||
|
|||||||
@@ -625,6 +625,16 @@ Examples::
|
|||||||
%p4cc Y10 little-endian (0x20303159)
|
%p4cc Y10 little-endian (0x20303159)
|
||||||
%p4cc NV12 big-endian (0xb231564e)
|
%p4cc NV12 big-endian (0xb231564e)
|
||||||
|
|
||||||
|
Rust
|
||||||
|
----
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
%pA
|
||||||
|
|
||||||
|
Only intended to be used from Rust code to format ``core::fmt::Arguments``.
|
||||||
|
Do *not* use it from C.
|
||||||
|
|
||||||
Thanks
|
Thanks
|
||||||
======
|
======
|
||||||
|
|
||||||
|
|||||||
18
Documentation/core-api/wrappers/atomic_bitops.rst
Normal file
18
Documentation/core-api/wrappers/atomic_bitops.rst
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
This is a simple wrapper to bring atomic_bitops.txt into the RST world
|
||||||
|
until such a time as that file can be converted directly.
|
||||||
|
|
||||||
|
=============
|
||||||
|
Atomic bitops
|
||||||
|
=============
|
||||||
|
|
||||||
|
.. raw:: latex
|
||||||
|
|
||||||
|
\footnotesize
|
||||||
|
|
||||||
|
.. include:: ../../atomic_bitops.txt
|
||||||
|
:literal:
|
||||||
|
|
||||||
|
.. raw:: latex
|
||||||
|
|
||||||
|
\normalsize
|
||||||
19
Documentation/core-api/wrappers/atomic_t.rst
Normal file
19
Documentation/core-api/wrappers/atomic_t.rst
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
This is a simple wrapper to bring atomic_t.txt into the RST world
|
||||||
|
until such a time as that file can be converted directly.
|
||||||
|
|
||||||
|
============
|
||||||
|
Atomic types
|
||||||
|
============
|
||||||
|
|
||||||
|
.. raw:: latex
|
||||||
|
|
||||||
|
\footnotesize
|
||||||
|
|
||||||
|
.. include:: ../../atomic_t.txt
|
||||||
|
:literal:
|
||||||
|
|
||||||
|
.. raw:: latex
|
||||||
|
|
||||||
|
\normalsize
|
||||||
|
|
||||||
18
Documentation/core-api/wrappers/memory-barriers.rst
Normal file
18
Documentation/core-api/wrappers/memory-barriers.rst
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
This is a simple wrapper to bring memory-barriers.txt into the RST world
|
||||||
|
until such a time as that file can be converted directly.
|
||||||
|
|
||||||
|
============================
|
||||||
|
Linux kernel memory barriers
|
||||||
|
============================
|
||||||
|
|
||||||
|
.. raw:: latex
|
||||||
|
|
||||||
|
\footnotesize
|
||||||
|
|
||||||
|
.. include:: ../../memory-barriers.txt
|
||||||
|
:literal:
|
||||||
|
|
||||||
|
.. raw:: latex
|
||||||
|
|
||||||
|
\normalsize
|
||||||
@@ -612,6 +612,13 @@ Commit message
|
|||||||
|
|
||||||
See: https://www.kernel.org/doc/html/latest/process/submitting-patches.html#describe-your-changes
|
See: https://www.kernel.org/doc/html/latest/process/submitting-patches.html#describe-your-changes
|
||||||
|
|
||||||
|
**BAD_FIXES_TAG**
|
||||||
|
The Fixes: tag is malformed or does not follow the community conventions.
|
||||||
|
This can occur if the tag have been split into multiple lines (e.g., when
|
||||||
|
pasted in an email program with word wrapping enabled).
|
||||||
|
|
||||||
|
See: https://www.kernel.org/doc/html/latest/process/submitting-patches.html#describe-your-changes
|
||||||
|
|
||||||
|
|
||||||
Comparison style
|
Comparison style
|
||||||
----------------
|
----------------
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ Documentation/dev-tools/testing-overview.rst
|
|||||||
kcov
|
kcov
|
||||||
gcov
|
gcov
|
||||||
kasan
|
kasan
|
||||||
|
kmsan
|
||||||
ubsan
|
ubsan
|
||||||
kmemleak
|
kmemleak
|
||||||
kcsan
|
kcsan
|
||||||
|
|||||||
@@ -111,9 +111,17 @@ parameter can be used to control panic and reporting behaviour:
|
|||||||
report or also panic the kernel (default: ``report``). The panic happens even
|
report or also panic the kernel (default: ``report``). The panic happens even
|
||||||
if ``kasan_multi_shot`` is enabled.
|
if ``kasan_multi_shot`` is enabled.
|
||||||
|
|
||||||
Hardware Tag-Based KASAN mode (see the section about various modes below) is
|
Software and Hardware Tag-Based KASAN modes (see the section about various
|
||||||
intended for use in production as a security mitigation. Therefore, it supports
|
modes below) support altering stack trace collection behavior:
|
||||||
additional boot parameters that allow disabling KASAN or controlling features:
|
|
||||||
|
- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack
|
||||||
|
traces collection (default: ``on``).
|
||||||
|
- ``kasan.stack_ring_size=<number of entries>`` specifies the number of entries
|
||||||
|
in the stack ring (default: ``32768``).
|
||||||
|
|
||||||
|
Hardware Tag-Based KASAN mode is intended for use in production as a security
|
||||||
|
mitigation. Therefore, it supports additional boot parameters that allow
|
||||||
|
disabling KASAN altogether or controlling its features:
|
||||||
|
|
||||||
- ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``).
|
- ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``).
|
||||||
|
|
||||||
@@ -132,9 +140,6 @@ additional boot parameters that allow disabling KASAN or controlling features:
|
|||||||
- ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc
|
- ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc
|
||||||
allocations (default: ``on``).
|
allocations (default: ``on``).
|
||||||
|
|
||||||
- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack
|
|
||||||
traces collection (default: ``on``).
|
|
||||||
|
|
||||||
Error reports
|
Error reports
|
||||||
~~~~~~~~~~~~~
|
~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
|||||||
427
Documentation/dev-tools/kmsan.rst
Normal file
427
Documentation/dev-tools/kmsan.rst
Normal file
@@ -0,0 +1,427 @@
|
|||||||
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
.. Copyright (C) 2022, Google LLC.
|
||||||
|
|
||||||
|
===================================
|
||||||
|
The Kernel Memory Sanitizer (KMSAN)
|
||||||
|
===================================
|
||||||
|
|
||||||
|
KMSAN is a dynamic error detector aimed at finding uses of uninitialized
|
||||||
|
values. It is based on compiler instrumentation, and is quite similar to the
|
||||||
|
userspace `MemorySanitizer tool`_.
|
||||||
|
|
||||||
|
An important note is that KMSAN is not intended for production use, because it
|
||||||
|
drastically increases kernel memory footprint and slows the whole system down.
|
||||||
|
|
||||||
|
Usage
|
||||||
|
=====
|
||||||
|
|
||||||
|
Building the kernel
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
In order to build a kernel with KMSAN you will need a fresh Clang (14.0.6+).
|
||||||
|
Please refer to `LLVM documentation`_ for the instructions on how to build Clang.
|
||||||
|
|
||||||
|
Now configure and build the kernel with CONFIG_KMSAN enabled.
|
||||||
|
|
||||||
|
Example report
|
||||||
|
--------------
|
||||||
|
|
||||||
|
Here is an example of a KMSAN report::
|
||||||
|
|
||||||
|
=====================================================
|
||||||
|
BUG: KMSAN: uninit-value in test_uninit_kmsan_check_memory+0x1be/0x380 [kmsan_test]
|
||||||
|
test_uninit_kmsan_check_memory+0x1be/0x380 mm/kmsan/kmsan_test.c:273
|
||||||
|
kunit_run_case_internal lib/kunit/test.c:333
|
||||||
|
kunit_try_run_case+0x206/0x420 lib/kunit/test.c:374
|
||||||
|
kunit_generic_run_threadfn_adapter+0x6d/0xc0 lib/kunit/try-catch.c:28
|
||||||
|
kthread+0x721/0x850 kernel/kthread.c:327
|
||||||
|
ret_from_fork+0x1f/0x30 ??:?
|
||||||
|
|
||||||
|
Uninit was stored to memory at:
|
||||||
|
do_uninit_local_array+0xfa/0x110 mm/kmsan/kmsan_test.c:260
|
||||||
|
test_uninit_kmsan_check_memory+0x1a2/0x380 mm/kmsan/kmsan_test.c:271
|
||||||
|
kunit_run_case_internal lib/kunit/test.c:333
|
||||||
|
kunit_try_run_case+0x206/0x420 lib/kunit/test.c:374
|
||||||
|
kunit_generic_run_threadfn_adapter+0x6d/0xc0 lib/kunit/try-catch.c:28
|
||||||
|
kthread+0x721/0x850 kernel/kthread.c:327
|
||||||
|
ret_from_fork+0x1f/0x30 ??:?
|
||||||
|
|
||||||
|
Local variable uninit created at:
|
||||||
|
do_uninit_local_array+0x4a/0x110 mm/kmsan/kmsan_test.c:256
|
||||||
|
test_uninit_kmsan_check_memory+0x1a2/0x380 mm/kmsan/kmsan_test.c:271
|
||||||
|
|
||||||
|
Bytes 4-7 of 8 are uninitialized
|
||||||
|
Memory access of size 8 starts at ffff888083fe3da0
|
||||||
|
|
||||||
|
CPU: 0 PID: 6731 Comm: kunit_try_catch Tainted: G B E 5.16.0-rc3+ #104
|
||||||
|
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
|
||||||
|
=====================================================
|
||||||
|
|
||||||
|
The report says that the local variable ``uninit`` was created uninitialized in
|
||||||
|
``do_uninit_local_array()``. The third stack trace corresponds to the place
|
||||||
|
where this variable was created.
|
||||||
|
|
||||||
|
The first stack trace shows where the uninit value was used (in
|
||||||
|
``test_uninit_kmsan_check_memory()``). The tool shows the bytes which were left
|
||||||
|
uninitialized in the local variable, as well as the stack where the value was
|
||||||
|
copied to another memory location before use.
|
||||||
|
|
||||||
|
A use of uninitialized value ``v`` is reported by KMSAN in the following cases:
|
||||||
|
- in a condition, e.g. ``if (v) { ... }``;
|
||||||
|
- in an indexing or pointer dereferencing, e.g. ``array[v]`` or ``*v``;
|
||||||
|
- when it is copied to userspace or hardware, e.g. ``copy_to_user(..., &v, ...)``;
|
||||||
|
- when it is passed as an argument to a function, and
|
||||||
|
``CONFIG_KMSAN_CHECK_PARAM_RETVAL`` is enabled (see below).
|
||||||
|
|
||||||
|
The mentioned cases (apart from copying data to userspace or hardware, which is
|
||||||
|
a security issue) are considered undefined behavior from the C11 Standard point
|
||||||
|
of view.
|
||||||
|
|
||||||
|
Disabling the instrumentation
|
||||||
|
-----------------------------
|
||||||
|
|
||||||
|
A function can be marked with ``__no_kmsan_checks``. Doing so makes KMSAN
|
||||||
|
ignore uninitialized values in that function and mark its output as initialized.
|
||||||
|
As a result, the user will not get KMSAN reports related to that function.
|
||||||
|
|
||||||
|
Another function attribute supported by KMSAN is ``__no_sanitize_memory``.
|
||||||
|
Applying this attribute to a function will result in KMSAN not instrumenting
|
||||||
|
it, which can be helpful if we do not want the compiler to interfere with some
|
||||||
|
low-level code (e.g. that marked with ``noinstr`` which implicitly adds
|
||||||
|
``__no_sanitize_memory``).
|
||||||
|
|
||||||
|
This however comes at a cost: stack allocations from such functions will have
|
||||||
|
incorrect shadow/origin values, likely leading to false positives. Functions
|
||||||
|
called from non-instrumented code may also receive incorrect metadata for their
|
||||||
|
parameters.
|
||||||
|
|
||||||
|
As a rule of thumb, avoid using ``__no_sanitize_memory`` explicitly.
|
||||||
|
|
||||||
|
It is also possible to disable KMSAN for a single file (e.g. main.o)::
|
||||||
|
|
||||||
|
KMSAN_SANITIZE_main.o := n
|
||||||
|
|
||||||
|
or for the whole directory::
|
||||||
|
|
||||||
|
KMSAN_SANITIZE := n
|
||||||
|
|
||||||
|
in the Makefile. Think of this as applying ``__no_sanitize_memory`` to every
|
||||||
|
function in the file or directory. Most users won't need KMSAN_SANITIZE, unless
|
||||||
|
their code gets broken by KMSAN (e.g. runs at early boot time).
|
||||||
|
|
||||||
|
Support
|
||||||
|
=======
|
||||||
|
|
||||||
|
In order for KMSAN to work the kernel must be built with Clang, which so far is
|
||||||
|
the only compiler that has KMSAN support. The kernel instrumentation pass is
|
||||||
|
based on the userspace `MemorySanitizer tool`_.
|
||||||
|
|
||||||
|
The runtime library only supports x86_64 at the moment.
|
||||||
|
|
||||||
|
How KMSAN works
|
||||||
|
===============
|
||||||
|
|
||||||
|
KMSAN shadow memory
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
KMSAN associates a metadata byte (also called shadow byte) with every byte of
|
||||||
|
kernel memory. A bit in the shadow byte is set iff the corresponding bit of the
|
||||||
|
kernel memory byte is uninitialized. Marking the memory uninitialized (i.e.
|
||||||
|
setting its shadow bytes to ``0xff``) is called poisoning, marking it
|
||||||
|
initialized (setting the shadow bytes to ``0x00``) is called unpoisoning.
|
||||||
|
|
||||||
|
When a new variable is allocated on the stack, it is poisoned by default by
|
||||||
|
instrumentation code inserted by the compiler (unless it is a stack variable
|
||||||
|
that is immediately initialized). Any new heap allocation done without
|
||||||
|
``__GFP_ZERO`` is also poisoned.
|
||||||
|
|
||||||
|
Compiler instrumentation also tracks the shadow values as they are used along
|
||||||
|
the code. When needed, instrumentation code invokes the runtime library in
|
||||||
|
``mm/kmsan/`` to persist shadow values.
|
||||||
|
|
||||||
|
The shadow value of a basic or compound type is an array of bytes of the same
|
||||||
|
length. When a constant value is written into memory, that memory is unpoisoned.
|
||||||
|
When a value is read from memory, its shadow memory is also obtained and
|
||||||
|
propagated into all the operations which use that value. For every instruction
|
||||||
|
that takes one or more values the compiler generates code that calculates the
|
||||||
|
shadow of the result depending on those values and their shadows.
|
||||||
|
|
||||||
|
Example::
|
||||||
|
|
||||||
|
int a = 0xff; // i.e. 0x000000ff
|
||||||
|
int b;
|
||||||
|
int c = a | b;
|
||||||
|
|
||||||
|
In this case the shadow of ``a`` is ``0``, shadow of ``b`` is ``0xffffffff``,
|
||||||
|
shadow of ``c`` is ``0xffffff00``. This means that the upper three bytes of
|
||||||
|
``c`` are uninitialized, while the lower byte is initialized.
|
||||||
|
|
||||||
|
Origin tracking
|
||||||
|
---------------
|
||||||
|
|
||||||
|
Every four bytes of kernel memory also have a so-called origin mapped to them.
|
||||||
|
This origin describes the point in program execution at which the uninitialized
|
||||||
|
value was created. Every origin is associated with either the full allocation
|
||||||
|
stack (for heap-allocated memory), or the function containing the uninitialized
|
||||||
|
variable (for locals).
|
||||||
|
|
||||||
|
When an uninitialized variable is allocated on stack or heap, a new origin
|
||||||
|
value is created, and that variable's origin is filled with that value. When a
|
||||||
|
value is read from memory, its origin is also read and kept together with the
|
||||||
|
shadow. For every instruction that takes one or more values, the origin of the
|
||||||
|
result is one of the origins corresponding to any of the uninitialized inputs.
|
||||||
|
If a poisoned value is written into memory, its origin is written to the
|
||||||
|
corresponding storage as well.
|
||||||
|
|
||||||
|
Example 1::
|
||||||
|
|
||||||
|
int a = 42;
|
||||||
|
int b;
|
||||||
|
int c = a + b;
|
||||||
|
|
||||||
|
In this case the origin of ``b`` is generated upon function entry, and is
|
||||||
|
stored to the origin of ``c`` right before the addition result is written into
|
||||||
|
memory.
|
||||||
|
|
||||||
|
Several variables may share the same origin address, if they are stored in the
|
||||||
|
same four-byte chunk. In this case every write to either variable updates the
|
||||||
|
origin for all of them. We have to sacrifice precision in this case, because
|
||||||
|
storing origins for individual bits (and even bytes) would be too costly.
|
||||||
|
|
||||||
|
Example 2::
|
||||||
|
|
||||||
|
int combine(short a, short b) {
|
||||||
|
union ret_t {
|
||||||
|
int i;
|
||||||
|
short s[2];
|
||||||
|
} ret;
|
||||||
|
ret.s[0] = a;
|
||||||
|
ret.s[1] = b;
|
||||||
|
return ret.i;
|
||||||
|
}
|
||||||
|
|
||||||
|
If ``a`` is initialized and ``b`` is not, the shadow of the result would be
|
||||||
|
0xffff0000, and the origin of the result would be the origin of ``b``.
|
||||||
|
``ret.s[0]`` would have the same origin, but it will never be used, because
|
||||||
|
that variable is initialized.
|
||||||
|
|
||||||
|
If both function arguments are uninitialized, only the origin of the second
|
||||||
|
argument is preserved.
|
||||||
|
|
||||||
|
Origin chaining
|
||||||
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
To ease debugging, KMSAN creates a new origin for every store of an
|
||||||
|
uninitialized value to memory. The new origin references both its creation stack
|
||||||
|
and the previous origin the value had. This may cause increased memory
|
||||||
|
consumption, so we limit the length of origin chains in the runtime.
|
||||||
|
|
||||||
|
Clang instrumentation API
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
Clang instrumentation pass inserts calls to functions defined in
|
||||||
|
``mm/kmsan/nstrumentation.c`` into the kernel code.
|
||||||
|
|
||||||
|
Shadow manipulation
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
For every memory access the compiler emits a call to a function that returns a
|
||||||
|
pair of pointers to the shadow and origin addresses of the given memory::
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
void *shadow, *origin;
|
||||||
|
} shadow_origin_ptr_t
|
||||||
|
|
||||||
|
shadow_origin_ptr_t __msan_metadata_ptr_for_load_{1,2,4,8}(void *addr)
|
||||||
|
shadow_origin_ptr_t __msan_metadata_ptr_for_store_{1,2,4,8}(void *addr)
|
||||||
|
shadow_origin_ptr_t __msan_metadata_ptr_for_load_n(void *addr, uintptr_t size)
|
||||||
|
shadow_origin_ptr_t __msan_metadata_ptr_for_store_n(void *addr, uintptr_t size)
|
||||||
|
|
||||||
|
The function name depends on the memory access size.
|
||||||
|
|
||||||
|
The compiler makes sure that for every loaded value its shadow and origin
|
||||||
|
values are read from memory. When a value is stored to memory, its shadow and
|
||||||
|
origin are also stored using the metadata pointers.
|
||||||
|
|
||||||
|
Handling locals
|
||||||
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
A special function is used to create a new origin value for a local variable and
|
||||||
|
set the origin of that variable to that value::
|
||||||
|
|
||||||
|
void __msan_poison_alloca(void *addr, uintptr_t size, char *descr)
|
||||||
|
|
||||||
|
Access to per-task data
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
At the beginning of every instrumented function KMSAN inserts a call to
|
||||||
|
``__msan_get_context_state()``::
|
||||||
|
|
||||||
|
kmsan_context_state *__msan_get_context_state(void)
|
||||||
|
|
||||||
|
``kmsan_context_state`` is declared in ``include/linux/kmsan.h``::
|
||||||
|
|
||||||
|
struct kmsan_context_state {
|
||||||
|
char param_tls[KMSAN_PARAM_SIZE];
|
||||||
|
char retval_tls[KMSAN_RETVAL_SIZE];
|
||||||
|
char va_arg_tls[KMSAN_PARAM_SIZE];
|
||||||
|
char va_arg_origin_tls[KMSAN_PARAM_SIZE];
|
||||||
|
u64 va_arg_overflow_size_tls;
|
||||||
|
char param_origin_tls[KMSAN_PARAM_SIZE];
|
||||||
|
depot_stack_handle_t retval_origin_tls;
|
||||||
|
};
|
||||||
|
|
||||||
|
This structure is used by KMSAN to pass parameter shadows and origins between
|
||||||
|
instrumented functions (unless the parameters are checked immediately by
|
||||||
|
``CONFIG_KMSAN_CHECK_PARAM_RETVAL``).
|
||||||
|
|
||||||
|
Passing uninitialized values to functions
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Clang's MemorySanitizer instrumentation has an option,
|
||||||
|
``-fsanitize-memory-param-retval``, which makes the compiler check function
|
||||||
|
parameters passed by value, as well as function return values.
|
||||||
|
|
||||||
|
The option is controlled by ``CONFIG_KMSAN_CHECK_PARAM_RETVAL``, which is
|
||||||
|
enabled by default to let KMSAN report uninitialized values earlier.
|
||||||
|
Please refer to the `LKML discussion`_ for more details.
|
||||||
|
|
||||||
|
Because of the way the checks are implemented in LLVM (they are only applied to
|
||||||
|
parameters marked as ``noundef``), not all parameters are guaranteed to be
|
||||||
|
checked, so we cannot give up the metadata storage in ``kmsan_context_state``.
|
||||||
|
|
||||||
|
String functions
|
||||||
|
~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
The compiler replaces calls to ``memcpy()``/``memmove()``/``memset()`` with the
|
||||||
|
following functions. These functions are also called when data structures are
|
||||||
|
initialized or copied, making sure shadow and origin values are copied alongside
|
||||||
|
with the data::
|
||||||
|
|
||||||
|
void *__msan_memcpy(void *dst, void *src, uintptr_t n)
|
||||||
|
void *__msan_memmove(void *dst, void *src, uintptr_t n)
|
||||||
|
void *__msan_memset(void *dst, int c, uintptr_t n)
|
||||||
|
|
||||||
|
Error reporting
|
||||||
|
~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
For each use of a value the compiler emits a shadow check that calls
|
||||||
|
``__msan_warning()`` in the case that value is poisoned::
|
||||||
|
|
||||||
|
void __msan_warning(u32 origin)
|
||||||
|
|
||||||
|
``__msan_warning()`` causes KMSAN runtime to print an error report.
|
||||||
|
|
||||||
|
Inline assembly instrumentation
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
KMSAN instruments every inline assembly output with a call to::
|
||||||
|
|
||||||
|
void __msan_instrument_asm_store(void *addr, uintptr_t size)
|
||||||
|
|
||||||
|
, which unpoisons the memory region.
|
||||||
|
|
||||||
|
This approach may mask certain errors, but it also helps to avoid a lot of
|
||||||
|
false positives in bitwise operations, atomics etc.
|
||||||
|
|
||||||
|
Sometimes the pointers passed into inline assembly do not point to valid memory.
|
||||||
|
In such cases they are ignored at runtime.
|
||||||
|
|
||||||
|
|
||||||
|
Runtime library
|
||||||
|
---------------
|
||||||
|
|
||||||
|
The code is located in ``mm/kmsan/``.
|
||||||
|
|
||||||
|
Per-task KMSAN state
|
||||||
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Every task_struct has an associated KMSAN task state that holds the KMSAN
|
||||||
|
context (see above) and a per-task flag disallowing KMSAN reports::
|
||||||
|
|
||||||
|
struct kmsan_context {
|
||||||
|
...
|
||||||
|
bool allow_reporting;
|
||||||
|
struct kmsan_context_state cstate;
|
||||||
|
...
|
||||||
|
}
|
||||||
|
|
||||||
|
struct task_struct {
|
||||||
|
...
|
||||||
|
struct kmsan_context kmsan;
|
||||||
|
...
|
||||||
|
}
|
||||||
|
|
||||||
|
KMSAN contexts
|
||||||
|
~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
When running in a kernel task context, KMSAN uses ``current->kmsan.cstate`` to
|
||||||
|
hold the metadata for function parameters and return values.
|
||||||
|
|
||||||
|
But in the case the kernel is running in the interrupt, softirq or NMI context,
|
||||||
|
where ``current`` is unavailable, KMSAN switches to per-cpu interrupt state::
|
||||||
|
|
||||||
|
DEFINE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx);
|
||||||
|
|
||||||
|
Metadata allocation
|
||||||
|
~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
There are several places in the kernel for which the metadata is stored.
|
||||||
|
|
||||||
|
1. Each ``struct page`` instance contains two pointers to its shadow and
|
||||||
|
origin pages::
|
||||||
|
|
||||||
|
struct page {
|
||||||
|
...
|
||||||
|
struct page *shadow, *origin;
|
||||||
|
...
|
||||||
|
};
|
||||||
|
|
||||||
|
At boot-time, the kernel allocates shadow and origin pages for every available
|
||||||
|
kernel page. This is done quite late, when the kernel address space is already
|
||||||
|
fragmented, so normal data pages may arbitrarily interleave with the metadata
|
||||||
|
pages.
|
||||||
|
|
||||||
|
This means that in general for two contiguous memory pages their shadow/origin
|
||||||
|
pages may not be contiguous. Consequently, if a memory access crosses the
|
||||||
|
boundary of a memory block, accesses to shadow/origin memory may potentially
|
||||||
|
corrupt other pages or read incorrect values from them.
|
||||||
|
|
||||||
|
In practice, contiguous memory pages returned by the same ``alloc_pages()``
|
||||||
|
call will have contiguous metadata, whereas if these pages belong to two
|
||||||
|
different allocations their metadata pages can be fragmented.
|
||||||
|
|
||||||
|
For the kernel data (``.data``, ``.bss`` etc.) and percpu memory regions
|
||||||
|
there also are no guarantees on metadata contiguity.
|
||||||
|
|
||||||
|
In the case ``__msan_metadata_ptr_for_XXX_YYY()`` hits the border between two
|
||||||
|
pages with non-contiguous metadata, it returns pointers to fake shadow/origin regions::
|
||||||
|
|
||||||
|
char dummy_load_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
|
||||||
|
char dummy_store_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
|
||||||
|
|
||||||
|
``dummy_load_page`` is zero-initialized, so reads from it always yield zeroes.
|
||||||
|
All stores to ``dummy_store_page`` are ignored.
|
||||||
|
|
||||||
|
2. For vmalloc memory and modules, there is a direct mapping between the memory
|
||||||
|
range, its shadow and origin. KMSAN reduces the vmalloc area by 3/4, making only
|
||||||
|
the first quarter available to ``vmalloc()``. The second quarter of the vmalloc
|
||||||
|
area contains shadow memory for the first quarter, the third one holds the
|
||||||
|
origins. A small part of the fourth quarter contains shadow and origins for the
|
||||||
|
kernel modules. Please refer to ``arch/x86/include/asm/pgtable_64_types.h`` for
|
||||||
|
more details.
|
||||||
|
|
||||||
|
When an array of pages is mapped into a contiguous virtual memory space, their
|
||||||
|
shadow and origin pages are similarly mapped into contiguous regions.
|
||||||
|
|
||||||
|
References
|
||||||
|
==========
|
||||||
|
|
||||||
|
E. Stepanov, K. Serebryany. `MemorySanitizer: fast detector of uninitialized
|
||||||
|
memory use in C++
|
||||||
|
<https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43308.pdf>`_.
|
||||||
|
In Proceedings of CGO 2015.
|
||||||
|
|
||||||
|
.. _MemorySanitizer tool: https://clang.llvm.org/docs/MemorySanitizer.html
|
||||||
|
.. _LLVM documentation: https://llvm.org/docs/GettingStarted.html
|
||||||
|
.. _LKML discussion: https://lore.kernel.org/all/20220614144853.3693273-1-glider@google.com/
|
||||||
@@ -320,7 +320,7 @@ A bare bones test module might look like this:
|
|||||||
|
|
||||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||||
|
|
||||||
#include "../tools/testing/selftests/kselftest/module.h"
|
#include "../tools/testing/selftests/kselftest_module.h"
|
||||||
|
|
||||||
KSTM_MODULE_GLOBALS();
|
KSTM_MODULE_GLOBALS();
|
||||||
|
|
||||||
|
|||||||
@@ -6,8 +6,8 @@ KUnit Architecture
|
|||||||
|
|
||||||
The KUnit architecture can be divided into two parts:
|
The KUnit architecture can be divided into two parts:
|
||||||
|
|
||||||
- Kernel testing library
|
- `In-Kernel Testing Framework`_
|
||||||
- kunit_tool (Command line test harness)
|
- `kunit_tool (Command Line Test Harness)`_
|
||||||
|
|
||||||
In-Kernel Testing Framework
|
In-Kernel Testing Framework
|
||||||
===========================
|
===========================
|
||||||
|
|||||||
@@ -31,13 +31,16 @@ For the most part, the KUnit core framework (what we use to write the tests)
|
|||||||
can compile to any architecture. It compiles like just another part of the
|
can compile to any architecture. It compiles like just another part of the
|
||||||
kernel and runs when the kernel boots, or when built as a module, when the
|
kernel and runs when the kernel boots, or when built as a module, when the
|
||||||
module is loaded. However, there is infrastructure, like the KUnit Wrapper
|
module is loaded. However, there is infrastructure, like the KUnit Wrapper
|
||||||
(``tools/testing/kunit/kunit.py``) that does not support other architectures.
|
(``tools/testing/kunit/kunit.py``) that might not support some architectures
|
||||||
|
(see :ref:`kunit-on-qemu`).
|
||||||
|
|
||||||
In short, yes, you can run KUnit on other architectures, but it might require
|
In short, yes, you can run KUnit on other architectures, but it might require
|
||||||
more work than using KUnit on UML.
|
more work than using KUnit on UML.
|
||||||
|
|
||||||
For more information, see :ref:`kunit-on-non-uml`.
|
For more information, see :ref:`kunit-on-non-uml`.
|
||||||
|
|
||||||
|
.. _kinds-of-tests:
|
||||||
|
|
||||||
What is the difference between a unit test and other kinds of tests?
|
What is the difference between a unit test and other kinds of tests?
|
||||||
====================================================================
|
====================================================================
|
||||||
Most existing tests for the Linux kernel would be categorized as an integration
|
Most existing tests for the Linux kernel would be categorized as an integration
|
||||||
@@ -95,8 +98,7 @@ things to try.
|
|||||||
seeing. When tests are built-in, they will execute when the kernel boots, and
|
seeing. When tests are built-in, they will execute when the kernel boots, and
|
||||||
modules will automatically execute associated tests when loaded. Test results
|
modules will automatically execute associated tests when loaded. Test results
|
||||||
can be collected from ``/sys/kernel/debug/kunit/<test suite>/results``, and
|
can be collected from ``/sys/kernel/debug/kunit/<test suite>/results``, and
|
||||||
can be parsed with ``kunit.py parse``. For more details, see "KUnit on
|
can be parsed with ``kunit.py parse``. For more details, see :ref:`kunit-on-qemu`.
|
||||||
non-UML architectures" in Documentation/dev-tools/kunit/usage.rst.
|
|
||||||
|
|
||||||
If none of the above tricks help, you are always welcome to email any issues to
|
If none of the above tricks help, you are always welcome to email any issues to
|
||||||
kunit-dev@googlegroups.com.
|
kunit-dev@googlegroups.com.
|
||||||
|
|||||||
@@ -13,7 +13,6 @@ KUnit - Linux Kernel Unit Testing
|
|||||||
run_wrapper
|
run_wrapper
|
||||||
run_manual
|
run_manual
|
||||||
usage
|
usage
|
||||||
kunit-tool
|
|
||||||
api/index
|
api/index
|
||||||
style
|
style
|
||||||
faq
|
faq
|
||||||
@@ -29,10 +28,10 @@ KUnit (Kernel unit testing framework) provides a common framework for
|
|||||||
unit tests within the Linux kernel. Using KUnit, you can define groups
|
unit tests within the Linux kernel. Using KUnit, you can define groups
|
||||||
of test cases called test suites. The tests either run on kernel boot
|
of test cases called test suites. The tests either run on kernel boot
|
||||||
if built-in, or load as a module. KUnit automatically flags and reports
|
if built-in, or load as a module. KUnit automatically flags and reports
|
||||||
failed test cases in the kernel log. The test results appear in `TAP
|
failed test cases in the kernel log. The test results appear in
|
||||||
(Test Anything Protocol) format <https://testanything.org/>`_. It is inspired by
|
:doc:`KTAP (Kernel - Test Anything Protocol) format</dev-tools/ktap>`.
|
||||||
JUnit, Python’s unittest.mock, and GoogleTest/GoogleMock (C++ unit testing
|
It is inspired by JUnit, Python’s unittest.mock, and GoogleTest/GoogleMock
|
||||||
framework).
|
(C++ unit testing framework).
|
||||||
|
|
||||||
KUnit tests are part of the kernel, written in the C (programming)
|
KUnit tests are part of the kernel, written in the C (programming)
|
||||||
language, and test parts of the Kernel implementation (example: a C
|
language, and test parts of the Kernel implementation (example: a C
|
||||||
@@ -46,8 +45,9 @@ internal system functionality. KUnit runs in kernel space and is not
|
|||||||
restricted to things exposed to user-space.
|
restricted to things exposed to user-space.
|
||||||
|
|
||||||
In addition, KUnit has kunit_tool, a script (``tools/testing/kunit/kunit.py``)
|
In addition, KUnit has kunit_tool, a script (``tools/testing/kunit/kunit.py``)
|
||||||
that configures the Linux kernel, runs KUnit tests under QEMU or UML (`User Mode
|
that configures the Linux kernel, runs KUnit tests under QEMU or UML
|
||||||
Linux <http://user-mode-linux.sourceforge.net/>`_), parses the test results and
|
(:doc:`User Mode Linux </virt/uml/user_mode_linux_howto_v2>`),
|
||||||
|
parses the test results and
|
||||||
displays them in a user friendly manner.
|
displays them in a user friendly manner.
|
||||||
|
|
||||||
Features
|
Features
|
||||||
@@ -95,6 +95,8 @@ Unit Testing Advantages
|
|||||||
- Improves code quality.
|
- Improves code quality.
|
||||||
- Encourages writing testable code.
|
- Encourages writing testable code.
|
||||||
|
|
||||||
|
Read also :ref:`kinds-of-tests`.
|
||||||
|
|
||||||
How do I use it?
|
How do I use it?
|
||||||
================
|
================
|
||||||
|
|
||||||
@@ -107,7 +109,5 @@ How do I use it?
|
|||||||
examples.
|
examples.
|
||||||
* Documentation/dev-tools/kunit/api/index.rst - KUnit APIs
|
* Documentation/dev-tools/kunit/api/index.rst - KUnit APIs
|
||||||
used for testing.
|
used for testing.
|
||||||
* Documentation/dev-tools/kunit/kunit-tool.rst - kunit_tool helper
|
|
||||||
script.
|
|
||||||
* Documentation/dev-tools/kunit/faq.rst - KUnit common questions and
|
* Documentation/dev-tools/kunit/faq.rst - KUnit common questions and
|
||||||
answers.
|
answers.
|
||||||
|
|||||||
@@ -1,232 +0,0 @@
|
|||||||
.. SPDX-License-Identifier: GPL-2.0
|
|
||||||
|
|
||||||
=================
|
|
||||||
kunit_tool How-To
|
|
||||||
=================
|
|
||||||
|
|
||||||
What is kunit_tool?
|
|
||||||
===================
|
|
||||||
|
|
||||||
kunit_tool is a script (``tools/testing/kunit/kunit.py``) that aids in building
|
|
||||||
the Linux kernel as UML (`User Mode Linux
|
|
||||||
<http://user-mode-linux.sourceforge.net/>`_), running KUnit tests, parsing
|
|
||||||
the test results and displaying them in a user friendly manner.
|
|
||||||
|
|
||||||
kunit_tool addresses the problem of being able to run tests without needing a
|
|
||||||
virtual machine or actual hardware with User Mode Linux. User Mode Linux is a
|
|
||||||
Linux architecture, like ARM or x86; however, unlike other architectures it
|
|
||||||
compiles the kernel as a standalone Linux executable that can be run like any
|
|
||||||
other program directly inside of a host operating system. To be clear, it does
|
|
||||||
not require any virtualization support: it is just a regular program.
|
|
||||||
|
|
||||||
What is a .kunitconfig?
|
|
||||||
=======================
|
|
||||||
|
|
||||||
It's just a defconfig that kunit_tool looks for in the build directory
|
|
||||||
(``.kunit`` by default). kunit_tool uses it to generate a .config as you might
|
|
||||||
expect. In addition, it verifies that the generated .config contains the CONFIG
|
|
||||||
options in the .kunitconfig; the reason it does this is so that it is easy to
|
|
||||||
be sure that a CONFIG that enables a test actually ends up in the .config.
|
|
||||||
|
|
||||||
It's also possible to pass a separate .kunitconfig fragment to kunit_tool,
|
|
||||||
which is useful if you have several different groups of tests you wish
|
|
||||||
to run independently, or if you want to use pre-defined test configs for
|
|
||||||
certain subsystems.
|
|
||||||
|
|
||||||
Getting Started with kunit_tool
|
|
||||||
===============================
|
|
||||||
|
|
||||||
If a kunitconfig is present at the root directory, all you have to do is:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
./tools/testing/kunit/kunit.py run
|
|
||||||
|
|
||||||
However, you most likely want to use it with the following options:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
./tools/testing/kunit/kunit.py run --timeout=30 --jobs=`nproc --all`
|
|
||||||
|
|
||||||
- ``--timeout`` sets a maximum amount of time to allow tests to run.
|
|
||||||
- ``--jobs`` sets the number of threads to use to build the kernel.
|
|
||||||
|
|
||||||
.. note::
|
|
||||||
This command will work even without a .kunitconfig file: if no
|
|
||||||
.kunitconfig is present, a default one will be used instead.
|
|
||||||
|
|
||||||
If you wish to use a different .kunitconfig file (such as one provided for
|
|
||||||
testing a particular subsystem), you can pass it as an option.
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
./tools/testing/kunit/kunit.py run --kunitconfig=fs/ext4/.kunitconfig
|
|
||||||
|
|
||||||
For a list of all the flags supported by kunit_tool, you can run:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
./tools/testing/kunit/kunit.py run --help
|
|
||||||
|
|
||||||
Configuring, Building, and Running Tests
|
|
||||||
========================================
|
|
||||||
|
|
||||||
It's also possible to run just parts of the KUnit build process independently,
|
|
||||||
which is useful if you want to make manual changes to part of the process.
|
|
||||||
|
|
||||||
A .config can be generated from a .kunitconfig by using the ``config`` argument
|
|
||||||
when running kunit_tool:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
./tools/testing/kunit/kunit.py config
|
|
||||||
|
|
||||||
Similarly, if you just want to build a KUnit kernel from the current .config,
|
|
||||||
you can use the ``build`` argument:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
./tools/testing/kunit/kunit.py build
|
|
||||||
|
|
||||||
And, if you already have a built UML kernel with built-in KUnit tests, you can
|
|
||||||
run the kernel and display the test results with the ``exec`` argument:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
./tools/testing/kunit/kunit.py exec
|
|
||||||
|
|
||||||
The ``run`` command which is discussed above is equivalent to running all three
|
|
||||||
of these in sequence.
|
|
||||||
|
|
||||||
All of these commands accept a number of optional command-line arguments. The
|
|
||||||
``--help`` flag will give a complete list of these, or keep reading this page
|
|
||||||
for a guide to some of the more useful ones.
|
|
||||||
|
|
||||||
Parsing Test Results
|
|
||||||
====================
|
|
||||||
|
|
||||||
KUnit tests output their results in TAP (Test Anything Protocol) format.
|
|
||||||
kunit_tool will, when running tests, parse this output and print a summary
|
|
||||||
which is much more pleasant to read. If you wish to look at the raw test
|
|
||||||
results in TAP format, you can pass the ``--raw_output`` argument.
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
./tools/testing/kunit/kunit.py run --raw_output
|
|
||||||
|
|
||||||
The raw output from test runs may contain other, non-KUnit kernel log
|
|
||||||
lines. You can see just KUnit output with ``--raw_output=kunit``:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
./tools/testing/kunit/kunit.py run --raw_output=kunit
|
|
||||||
|
|
||||||
If you have KUnit results in their raw TAP format, you can parse them and print
|
|
||||||
the human-readable summary with the ``parse`` command for kunit_tool. This
|
|
||||||
accepts a filename for an argument, or will read from standard input.
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
# Reading from a file
|
|
||||||
./tools/testing/kunit/kunit.py parse /var/log/dmesg
|
|
||||||
# Reading from stdin
|
|
||||||
dmesg | ./tools/testing/kunit/kunit.py parse
|
|
||||||
|
|
||||||
This is very useful if you wish to run tests in a configuration not supported
|
|
||||||
by kunit_tool (such as on real hardware, or an unsupported architecture).
|
|
||||||
|
|
||||||
Filtering Tests
|
|
||||||
===============
|
|
||||||
|
|
||||||
It's possible to run only a subset of the tests built into a kernel by passing
|
|
||||||
a filter to the ``exec`` or ``run`` commands. For example, if you only wanted
|
|
||||||
to run KUnit resource tests, you could use:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
./tools/testing/kunit/kunit.py run 'kunit-resource*'
|
|
||||||
|
|
||||||
This uses the standard glob format for wildcards.
|
|
||||||
|
|
||||||
Running Tests on QEMU
|
|
||||||
=====================
|
|
||||||
|
|
||||||
kunit_tool supports running tests on QEMU as well as via UML (as mentioned
|
|
||||||
elsewhere). The default way of running tests on QEMU requires two flags:
|
|
||||||
|
|
||||||
``--arch``
|
|
||||||
Selects a collection of configs (Kconfig as well as QEMU configs
|
|
||||||
options, etc) that allow KUnit tests to be run on the specified
|
|
||||||
architecture in a minimal way; this is usually not much slower than
|
|
||||||
using UML. The architecture argument is the same as the name of the
|
|
||||||
option passed to the ``ARCH`` variable used by Kbuild. Not all
|
|
||||||
architectures are currently supported by this flag, but can be handled
|
|
||||||
by the ``--qemu_config`` discussed later. If ``um`` is passed (or this
|
|
||||||
this flag is ignored) the tests will run via UML. Non-UML architectures,
|
|
||||||
e.g. i386, x86_64, arm, um, etc. Non-UML run on QEMU.
|
|
||||||
|
|
||||||
``--cross_compile``
|
|
||||||
Specifies the use of a toolchain by Kbuild. The argument passed here is
|
|
||||||
the same passed to the ``CROSS_COMPILE`` variable used by Kbuild. As a
|
|
||||||
reminder this will be the prefix for the toolchain binaries such as gcc
|
|
||||||
for example ``sparc64-linux-gnu-`` if you have the sparc toolchain
|
|
||||||
installed on your system, or
|
|
||||||
``$HOME/toolchains/microblaze/gcc-9.2.0-nolibc/microblaze-linux/bin/microblaze-linux-``
|
|
||||||
if you have downloaded the microblaze toolchain from the 0-day website
|
|
||||||
to a directory in your home directory called ``toolchains``.
|
|
||||||
|
|
||||||
In many cases it is likely that you may want to run an architecture which is
|
|
||||||
not supported by the ``--arch`` flag, or you may want to just run KUnit tests
|
|
||||||
on QEMU using a non-default configuration. For this use case, you can write
|
|
||||||
your own QemuConfig. These QemuConfigs are written in Python. They must have an
|
|
||||||
import line ``from ..qemu_config import QemuArchParams`` at the top of the file
|
|
||||||
and the file must contain a variable called ``QEMU_ARCH`` that has an instance
|
|
||||||
of ``QemuArchParams`` assigned to it. An example can be seen in
|
|
||||||
``tools/testing/kunit/qemu_configs/x86_64.py``.
|
|
||||||
|
|
||||||
Once you have a QemuConfig you can pass it into kunit_tool using the
|
|
||||||
``--qemu_config`` flag; when used this flag replaces the ``--arch`` flag. If we
|
|
||||||
were to do this with the ``x86_64.py`` example from above, the invocation would
|
|
||||||
look something like this:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
./tools/testing/kunit/kunit.py run \
|
|
||||||
--timeout=60 \
|
|
||||||
--jobs=12 \
|
|
||||||
--qemu_config=./tools/testing/kunit/qemu_configs/x86_64.py
|
|
||||||
|
|
||||||
Other Useful Options
|
|
||||||
====================
|
|
||||||
|
|
||||||
kunit_tool has a number of other command-line arguments which can be useful
|
|
||||||
when adapting it to fit your environment or needs.
|
|
||||||
|
|
||||||
Some of the more useful ones are:
|
|
||||||
|
|
||||||
``--help``
|
|
||||||
Lists all of the available options. Note that different commands
|
|
||||||
(``config``, ``build``, ``run``, etc) will have different supported
|
|
||||||
options. Place ``--help`` before the command to list common options,
|
|
||||||
and after the command for options specific to that command.
|
|
||||||
|
|
||||||
``--build_dir``
|
|
||||||
Specifies the build directory that kunit_tool will use. This is where
|
|
||||||
the .kunitconfig file is located, as well as where the .config and
|
|
||||||
compiled kernel will be placed. Defaults to ``.kunit``.
|
|
||||||
|
|
||||||
``--make_options``
|
|
||||||
Specifies additional options to pass to ``make`` when compiling a
|
|
||||||
kernel (with the ``build`` or ``run`` commands). For example, to enable
|
|
||||||
compiler warnings, you can pass ``--make_options W=1``.
|
|
||||||
|
|
||||||
``--alltests``
|
|
||||||
Builds a UML kernel with all config options enabled using ``make
|
|
||||||
allyesconfig``. This allows you to run as many tests as is possible,
|
|
||||||
but is very slow and prone to breakage as new options are added or
|
|
||||||
modified. In most cases, enabling all tests which have satisfied
|
|
||||||
dependencies by adding ``CONFIG_KUNIT_ALL_TESTS=1`` to your
|
|
||||||
.kunitconfig is preferable.
|
|
||||||
|
|
||||||
There are several other options (and new ones are often added), so do check
|
|
||||||
``--help`` if you're looking for something not mentioned here.
|
|
||||||
@@ -1,8 +1,8 @@
|
|||||||
.. SPDX-License-Identifier: GPL-2.0
|
.. SPDX-License-Identifier: GPL-2.0
|
||||||
|
|
||||||
=========================
|
=============================
|
||||||
Run Tests with kunit_tool
|
Running tests with kunit_tool
|
||||||
=========================
|
=============================
|
||||||
|
|
||||||
We can either run KUnit tests using kunit_tool or can run tests
|
We can either run KUnit tests using kunit_tool or can run tests
|
||||||
manually, and then use kunit_tool to parse the results. To run tests
|
manually, and then use kunit_tool to parse the results. To run tests
|
||||||
@@ -22,7 +22,7 @@ We should see the following:
|
|||||||
|
|
||||||
.. code-block::
|
.. code-block::
|
||||||
|
|
||||||
Generating .config...
|
Configuring KUnit Kernel ...
|
||||||
Building KUnit kernel...
|
Building KUnit kernel...
|
||||||
Starting KUnit kernel...
|
Starting KUnit kernel...
|
||||||
|
|
||||||
@@ -30,7 +30,7 @@ We may want to use the following options:
|
|||||||
|
|
||||||
.. code-block::
|
.. code-block::
|
||||||
|
|
||||||
./tools/testing/kunit/kunit.py run --timeout=30 --jobs=`nproc --all
|
./tools/testing/kunit/kunit.py run --timeout=30 --jobs=`nproc --all`
|
||||||
|
|
||||||
- ``--timeout`` sets a maximum amount of time for tests to run.
|
- ``--timeout`` sets a maximum amount of time for tests to run.
|
||||||
- ``--jobs`` sets the number of threads to build the kernel.
|
- ``--jobs`` sets the number of threads to build the kernel.
|
||||||
@@ -58,8 +58,8 @@ To view kunit_tool flags (optional command-line arguments), run:
|
|||||||
|
|
||||||
./tools/testing/kunit/kunit.py run --help
|
./tools/testing/kunit/kunit.py run --help
|
||||||
|
|
||||||
Create a ``.kunitconfig`` File
|
Creating a ``.kunitconfig`` file
|
||||||
===============================
|
================================
|
||||||
|
|
||||||
If we want to run a specific set of tests (rather than those listed
|
If we want to run a specific set of tests (rather than those listed
|
||||||
in the KUnit ``defconfig``), we can provide Kconfig options in the
|
in the KUnit ``defconfig``), we can provide Kconfig options in the
|
||||||
@@ -98,8 +98,8 @@ have not included the options dependencies.
|
|||||||
The build dir needs to be set for ``make menuconfig`` to
|
The build dir needs to be set for ``make menuconfig`` to
|
||||||
work, therefore by default use ``make O=.kunit menuconfig``.
|
work, therefore by default use ``make O=.kunit menuconfig``.
|
||||||
|
|
||||||
Configure, Build, and Run Tests
|
Configuring, building, and running tests
|
||||||
===============================
|
========================================
|
||||||
|
|
||||||
If we want to make manual changes to the KUnit build process, we
|
If we want to make manual changes to the KUnit build process, we
|
||||||
can run part of the KUnit build process independently.
|
can run part of the KUnit build process independently.
|
||||||
@@ -125,11 +125,11 @@ argument:
|
|||||||
|
|
||||||
./tools/testing/kunit/kunit.py exec
|
./tools/testing/kunit/kunit.py exec
|
||||||
|
|
||||||
The ``run`` command discussed in section: **Run Tests with kunit_tool**,
|
The ``run`` command discussed in section: **Running tests with kunit_tool**,
|
||||||
is equivalent to running the above three commands in sequence.
|
is equivalent to running the above three commands in sequence.
|
||||||
|
|
||||||
Parse Test Results
|
Parsing test results
|
||||||
==================
|
====================
|
||||||
|
|
||||||
KUnit tests output displays results in TAP (Test Anything Protocol)
|
KUnit tests output displays results in TAP (Test Anything Protocol)
|
||||||
format. When running tests, kunit_tool parses this output and prints
|
format. When running tests, kunit_tool parses this output and prints
|
||||||
@@ -152,8 +152,8 @@ standard input.
|
|||||||
# Reading from stdin
|
# Reading from stdin
|
||||||
dmesg | ./tools/testing/kunit/kunit.py parse
|
dmesg | ./tools/testing/kunit/kunit.py parse
|
||||||
|
|
||||||
Run Selected Test Suites
|
Filtering tests
|
||||||
========================
|
===============
|
||||||
|
|
||||||
By passing a bash style glob filter to the ``exec`` or ``run``
|
By passing a bash style glob filter to the ``exec`` or ``run``
|
||||||
commands, we can run a subset of the tests built into a kernel . For
|
commands, we can run a subset of the tests built into a kernel . For
|
||||||
@@ -165,8 +165,10 @@ example: if we only want to run KUnit resource tests, use:
|
|||||||
|
|
||||||
This uses the standard glob format with wildcard characters.
|
This uses the standard glob format with wildcard characters.
|
||||||
|
|
||||||
Run Tests on qemu
|
.. _kunit-on-qemu:
|
||||||
=================
|
|
||||||
|
Running tests on QEMU
|
||||||
|
=====================
|
||||||
|
|
||||||
kunit_tool supports running tests on qemu as well as
|
kunit_tool supports running tests on qemu as well as
|
||||||
via UML. To run tests on qemu, by default it requires two flags:
|
via UML. To run tests on qemu, by default it requires two flags:
|
||||||
@@ -229,8 +231,8 @@ as
|
|||||||
--jobs=12 \
|
--jobs=12 \
|
||||||
--qemu_config=./tools/testing/kunit/qemu_configs/x86_64.py
|
--qemu_config=./tools/testing/kunit/qemu_configs/x86_64.py
|
||||||
|
|
||||||
Command-Line Arguments
|
Running command-line arguments
|
||||||
======================
|
==============================
|
||||||
|
|
||||||
kunit_tool has a number of other command-line arguments which can
|
kunit_tool has a number of other command-line arguments which can
|
||||||
be useful for our test environment. Below are the most commonly used
|
be useful for our test environment. Below are the most commonly used
|
||||||
@@ -249,14 +251,15 @@ command line arguments:
|
|||||||
compiling a kernel (using ``build`` or ``run`` commands). For example:
|
compiling a kernel (using ``build`` or ``run`` commands). For example:
|
||||||
to enable compiler warnings, we can pass ``--make_options W=1``.
|
to enable compiler warnings, we can pass ``--make_options W=1``.
|
||||||
|
|
||||||
- ``--alltests``: Builds a UML kernel with all config options enabled
|
- ``--alltests``: Enable a predefined set of options in order to build
|
||||||
using ``make allyesconfig``. This allows us to run as many tests as
|
as many tests as possible.
|
||||||
possible.
|
|
||||||
|
|
||||||
.. note:: It is slow and prone to breakage as new options are
|
.. note:: The list of enabled options can be found in
|
||||||
added or modified. Instead, enable all tests
|
``tools/testing/kunit/configs/all_tests.config``.
|
||||||
which have satisfied dependencies by adding
|
|
||||||
``CONFIG_KUNIT_ALL_TESTS=y`` to your ``.kunitconfig``.
|
If you only want to enable all tests with otherwise satisfied
|
||||||
|
dependencies, instead add ``CONFIG_KUNIT_ALL_TESTS=y`` to your
|
||||||
|
``.kunitconfig``.
|
||||||
|
|
||||||
- ``--kunitconfig``: Specifies the path or the directory of the ``.kunitconfig``
|
- ``--kunitconfig``: Specifies the path or the directory of the ``.kunitconfig``
|
||||||
file. For example:
|
file. For example:
|
||||||
|
|||||||
@@ -4,6 +4,10 @@
|
|||||||
Getting Started
|
Getting Started
|
||||||
===============
|
===============
|
||||||
|
|
||||||
|
This page contains an overview of the kunit_tool and KUnit framework,
|
||||||
|
teaching how to run existing tests and then how to write a simple test case,
|
||||||
|
and covers common problems users face when using KUnit for the first time.
|
||||||
|
|
||||||
Installing Dependencies
|
Installing Dependencies
|
||||||
=======================
|
=======================
|
||||||
KUnit has the same dependencies as the Linux kernel. As long as you can
|
KUnit has the same dependencies as the Linux kernel. As long as you can
|
||||||
@@ -19,30 +23,53 @@ can run kunit_tool:
|
|||||||
|
|
||||||
./tools/testing/kunit/kunit.py run
|
./tools/testing/kunit/kunit.py run
|
||||||
|
|
||||||
For more information on this wrapper, see:
|
.. note ::
|
||||||
|
You may see the following error:
|
||||||
|
"The source tree is not clean, please run 'make ARCH=um mrproper'"
|
||||||
|
|
||||||
|
This happens because internally kunit.py specifies ``.kunit``
|
||||||
|
(default option) as the build directory in the command ``make O=output/dir``
|
||||||
|
through the argument ``--build_dir``. Hence, before starting an
|
||||||
|
out-of-tree build, the source tree must be clean.
|
||||||
|
|
||||||
|
There is also the same caveat mentioned in the "Build directory for
|
||||||
|
the kernel" section of the :doc:`admin-guide </admin-guide/README>`,
|
||||||
|
that is, its use, it must be used for all invocations of ``make``.
|
||||||
|
The good news is that it can indeed be solved by running
|
||||||
|
``make ARCH=um mrproper``, just be aware that this will delete the
|
||||||
|
current configuration and all generated files.
|
||||||
|
|
||||||
|
If everything worked correctly, you should see the following:
|
||||||
|
|
||||||
|
.. code-block::
|
||||||
|
|
||||||
|
Configuring KUnit Kernel ...
|
||||||
|
Building KUnit Kernel ...
|
||||||
|
Starting KUnit Kernel ...
|
||||||
|
|
||||||
|
The tests will pass or fail.
|
||||||
|
|
||||||
|
.. note ::
|
||||||
|
Because it is building a lot of sources for the first time,
|
||||||
|
the ``Building KUnit Kernel`` step may take a while.
|
||||||
|
|
||||||
|
For detailed information on this wrapper, see:
|
||||||
Documentation/dev-tools/kunit/run_wrapper.rst.
|
Documentation/dev-tools/kunit/run_wrapper.rst.
|
||||||
|
|
||||||
Creating a ``.kunitconfig``
|
Selecting which tests to run
|
||||||
---------------------------
|
----------------------------
|
||||||
|
|
||||||
By default, kunit_tool runs a selection of tests. However, you can specify which
|
By default, kunit_tool runs all tests reachable with minimal configuration,
|
||||||
unit tests to run by creating a ``.kunitconfig`` file with kernel config options
|
that is, using default values for most of the kconfig options. However,
|
||||||
that enable only a specific set of tests and their dependencies.
|
you can select which tests to run by:
|
||||||
The ``.kunitconfig`` file contains a list of kconfig options which are required
|
|
||||||
to run the desired targets. The ``.kunitconfig`` also contains any other test
|
|
||||||
specific config options, such as test dependencies. For example: the
|
|
||||||
``FAT_FS`` tests - ``FAT_KUNIT_TEST``, depends on
|
|
||||||
``FAT_FS``. ``FAT_FS`` can be enabled by selecting either ``MSDOS_FS``
|
|
||||||
or ``VFAT_FS``. To run ``FAT_KUNIT_TEST``, the ``.kunitconfig`` has:
|
|
||||||
|
|
||||||
.. code-block:: none
|
- `Customizing Kconfig`_ used to compile the kernel, or
|
||||||
|
- `Filtering tests by name`_ to select specifically which compiled tests to run.
|
||||||
|
|
||||||
CONFIG_KUNIT=y
|
Customizing Kconfig
|
||||||
CONFIG_MSDOS_FS=y
|
~~~~~~~~~~~~~~~~~~~
|
||||||
CONFIG_FAT_KUNIT_TEST=y
|
A good starting point for the ``.kunitconfig`` is the KUnit default config.
|
||||||
|
If you didn't run ``kunit.py run`` yet, you can generate it by running:
|
||||||
1. A good starting point for the ``.kunitconfig`` is the KUnit default config.
|
|
||||||
You can generate it by running:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
.. code-block:: bash
|
||||||
|
|
||||||
@@ -54,48 +81,69 @@ or ``VFAT_FS``. To run ``FAT_KUNIT_TEST``, the ``.kunitconfig`` has:
|
|||||||
``.kunitconfig`` lives in the ``--build_dir`` used by kunit.py, which is
|
``.kunitconfig`` lives in the ``--build_dir`` used by kunit.py, which is
|
||||||
``.kunit`` by default.
|
``.kunit`` by default.
|
||||||
|
|
||||||
.. note ::
|
|
||||||
You may want to remove CONFIG_KUNIT_ALL_TESTS from the ``.kunitconfig`` as
|
|
||||||
it will enable a number of additional tests that you may not want.
|
|
||||||
|
|
||||||
2. You can then add any other Kconfig options, for example:
|
|
||||||
|
|
||||||
.. code-block:: none
|
|
||||||
|
|
||||||
CONFIG_LIST_KUNIT_TEST=y
|
|
||||||
|
|
||||||
Before running the tests, kunit_tool ensures that all config options
|
Before running the tests, kunit_tool ensures that all config options
|
||||||
set in ``.kunitconfig`` are set in the kernel ``.config``. It will warn
|
set in ``.kunitconfig`` are set in the kernel ``.config``. It will warn
|
||||||
you if you have not included dependencies for the options used.
|
you if you have not included dependencies for the options used.
|
||||||
|
|
||||||
.. note ::
|
There are many ways to customize the configurations:
|
||||||
If you change the ``.kunitconfig``, kunit.py will trigger a rebuild of the
|
|
||||||
|
a. Edit ``.kunit/.kunitconfig``. The file should contain the list of kconfig
|
||||||
|
options required to run the desired tests, including their dependencies.
|
||||||
|
You may want to remove CONFIG_KUNIT_ALL_TESTS from the ``.kunitconfig`` as
|
||||||
|
it will enable a number of additional tests that you may not want.
|
||||||
|
If you need to run on an architecture other than UML see :ref:`kunit-on-qemu`.
|
||||||
|
|
||||||
|
b. Enable additional kconfig options on top of ``.kunit/.kunitconfig``.
|
||||||
|
For example, to include the kernel's linked-list test you can run::
|
||||||
|
|
||||||
|
./tools/testing/kunit/kunit.py run \
|
||||||
|
--kconfig_add CONFIG_LIST_KUNIT_TEST=y
|
||||||
|
|
||||||
|
c. Provide the path of one or more .kunitconfig files from the tree.
|
||||||
|
For example, to run only ``FAT_FS`` and ``EXT4`` tests you can run::
|
||||||
|
|
||||||
|
./tools/testing/kunit/kunit.py run \
|
||||||
|
--kunitconfig ./fs/fat/.kunitconfig \
|
||||||
|
--kunitconfig ./fs/ext4/.kunitconfig
|
||||||
|
|
||||||
|
d. If you change the ``.kunitconfig``, kunit.py will trigger a rebuild of the
|
||||||
``.config`` file. But you can edit the ``.config`` file directly or with
|
``.config`` file. But you can edit the ``.config`` file directly or with
|
||||||
tools like ``make menuconfig O=.kunit``. As long as its a superset of
|
tools like ``make menuconfig O=.kunit``. As long as its a superset of
|
||||||
``.kunitconfig``, kunit.py won't overwrite your changes.
|
``.kunitconfig``, kunit.py won't overwrite your changes.
|
||||||
|
|
||||||
Running Tests (KUnit Wrapper)
|
|
||||||
-----------------------------
|
|
||||||
1. To make sure that everything is set up correctly, invoke the Python
|
|
||||||
wrapper from your kernel repository:
|
|
||||||
|
|
||||||
.. code-block:: bash
|
|
||||||
|
|
||||||
./tools/testing/kunit/kunit.py run
|
|
||||||
|
|
||||||
If everything worked correctly, you should see the following:
|
|
||||||
|
|
||||||
.. code-block::
|
|
||||||
|
|
||||||
Generating .config ...
|
|
||||||
Building KUnit Kernel ...
|
|
||||||
Starting KUnit Kernel ...
|
|
||||||
|
|
||||||
The tests will pass or fail.
|
|
||||||
|
|
||||||
.. note ::
|
.. note ::
|
||||||
Because it is building a lot of sources for the first time, the
|
|
||||||
``Building KUnit kernel`` may take a while.
|
To save a .kunitconfig after finding a satisfactory configuration::
|
||||||
|
|
||||||
|
make savedefconfig O=.kunit
|
||||||
|
cp .kunit/defconfig .kunit/.kunitconfig
|
||||||
|
|
||||||
|
Filtering tests by name
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
If you want to be more specific than Kconfig can provide, it is also possible
|
||||||
|
to select which tests to execute at boot-time by passing a glob filter
|
||||||
|
(read instructions regarding the pattern in the manpage :manpage:`glob(7)`).
|
||||||
|
If there is a ``"."`` (period) in the filter, it will be interpreted as a
|
||||||
|
separator between the name of the test suite and the test case,
|
||||||
|
otherwise, it will be interpreted as the name of the test suite.
|
||||||
|
For example, let's assume we are using the default config:
|
||||||
|
|
||||||
|
a. inform the name of a test suite, like ``"kunit_executor_test"``,
|
||||||
|
to run every test case it contains::
|
||||||
|
|
||||||
|
./tools/testing/kunit/kunit.py run "kunit_executor_test"
|
||||||
|
|
||||||
|
b. inform the name of a test case prefixed by its test suite,
|
||||||
|
like ``"example.example_simple_test"``, to run specifically that test case::
|
||||||
|
|
||||||
|
./tools/testing/kunit/kunit.py run "example.example_simple_test"
|
||||||
|
|
||||||
|
c. use wildcard characters (``*?[``) to run any test case that matches the pattern,
|
||||||
|
like ``"*.*64*"`` to run test cases containing ``"64"`` in the name inside
|
||||||
|
any test suite::
|
||||||
|
|
||||||
|
./tools/testing/kunit/kunit.py run "*.*64*"
|
||||||
|
|
||||||
Running Tests without the KUnit Wrapper
|
Running Tests without the KUnit Wrapper
|
||||||
=======================================
|
=======================================
|
||||||
@@ -217,7 +265,7 @@ Now we are ready to write the test cases.
|
|||||||
|
|
||||||
obj-$(CONFIG_MISC_EXAMPLE_TEST) += example_test.o
|
obj-$(CONFIG_MISC_EXAMPLE_TEST) += example_test.o
|
||||||
|
|
||||||
4. Add the following lines to ``.kunitconfig``:
|
4. Add the following lines to ``.kunit/.kunitconfig``:
|
||||||
|
|
||||||
.. code-block:: none
|
.. code-block:: none
|
||||||
|
|
||||||
@@ -254,7 +302,5 @@ Next Steps
|
|||||||
examples.
|
examples.
|
||||||
* Documentation/dev-tools/kunit/api/index.rst - KUnit APIs
|
* Documentation/dev-tools/kunit/api/index.rst - KUnit APIs
|
||||||
used for testing.
|
used for testing.
|
||||||
* Documentation/dev-tools/kunit/kunit-tool.rst - kunit_tool helper
|
|
||||||
script.
|
|
||||||
* Documentation/dev-tools/kunit/faq.rst - KUnit common questions and
|
* Documentation/dev-tools/kunit/faq.rst - KUnit common questions and
|
||||||
answers.
|
answers.
|
||||||
|
|||||||
@@ -165,6 +165,8 @@ built as a module).
|
|||||||
|
|
||||||
For more information, see Documentation/dev-tools/kunit/api/test.rst.
|
For more information, see Documentation/dev-tools/kunit/api/test.rst.
|
||||||
|
|
||||||
|
.. _kunit-on-non-uml:
|
||||||
|
|
||||||
Writing Tests For Other Architectures
|
Writing Tests For Other Architectures
|
||||||
-------------------------------------
|
-------------------------------------
|
||||||
|
|
||||||
@@ -544,8 +546,6 @@ By reusing the same ``cases`` array from above, we can write the test as a
|
|||||||
{}
|
{}
|
||||||
};
|
};
|
||||||
|
|
||||||
.. _kunit-on-non-uml:
|
|
||||||
|
|
||||||
Exiting Early on Failed Expectations
|
Exiting Early on Failed Expectations
|
||||||
------------------------------------
|
------------------------------------
|
||||||
|
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user