mirror of
https://github.com/lkl/linux.git
synced 2025-12-19 16:13:19 +09:00
Merge drm/drm-next into drm-misc-next
Let's kick-off this release cycle. Signed-off-by: Maxime Ripard <maxime@cerno.tech>
This commit is contained in:
@@ -1,2 +1,4 @@
|
||||
Alan Cox <alan@lxorguk.ukuu.org.uk>
|
||||
Alan Cox <root@hraefn.swansea.linux.org.uk>
|
||||
Christoph Hellwig <hch@lst.de>
|
||||
Marc Gonzalez <marc.w.gonzalez@free.fr>
|
||||
|
||||
6
.gitignore
vendored
6
.gitignore
vendored
@@ -37,6 +37,8 @@
|
||||
*.o
|
||||
*.o.*
|
||||
*.patch
|
||||
*.rmeta
|
||||
*.rsi
|
||||
*.s
|
||||
*.so
|
||||
*.so.dbg
|
||||
@@ -97,6 +99,7 @@ modules.order
|
||||
!.gitattributes
|
||||
!.gitignore
|
||||
!.mailmap
|
||||
!.rustfmt.toml
|
||||
|
||||
#
|
||||
# Generated include files
|
||||
@@ -162,3 +165,6 @@ x509.genkey
|
||||
|
||||
# Documentation toolchain
|
||||
sphinx_*/
|
||||
|
||||
# Rust analyzer configuration
|
||||
/rust-project.json
|
||||
|
||||
12
.mailmap
12
.mailmap
@@ -71,6 +71,9 @@ Ben M Cahill <ben.m.cahill@intel.com>
|
||||
Ben Widawsky <bwidawsk@kernel.org> <ben@bwidawsk.net>
|
||||
Ben Widawsky <bwidawsk@kernel.org> <ben.widawsky@intel.com>
|
||||
Ben Widawsky <bwidawsk@kernel.org> <benjamin.widawsky@intel.com>
|
||||
Bjorn Andersson <andersson@kernel.org> <bjorn@kryo.se>
|
||||
Bjorn Andersson <andersson@kernel.org> <bjorn.andersson@linaro.org>
|
||||
Bjorn Andersson <andersson@kernel.org> <bjorn.andersson@sonymobile.com>
|
||||
Björn Steinbrink <B.Steinbrink@gmx.de>
|
||||
Björn Töpel <bjorn@kernel.org> <bjorn.topel@gmail.com>
|
||||
Björn Töpel <bjorn@kernel.org> <bjorn.topel@intel.com>
|
||||
@@ -98,8 +101,7 @@ Christian Brauner <brauner@kernel.org> <christian.brauner@ubuntu.com>
|
||||
Christian Marangi <ansuelsmth@gmail.com>
|
||||
Christophe Ricard <christophe.ricard@gmail.com>
|
||||
Christoph Hellwig <hch@lst.de>
|
||||
Colin Ian King <colin.king@intel.com> <colin.king@canonical.com>
|
||||
Colin Ian King <colin.king@intel.com> <colin.i.king@gmail.com>
|
||||
Colin Ian King <colin.i.king@gmail.com> <colin.king@canonical.com>
|
||||
Corey Minyard <minyard@acm.org>
|
||||
Damian Hobson-Garcia <dhobsong@igel.co.jp>
|
||||
Daniel Borkmann <daniel@iogearbox.net> <danborkmann@googlemail.com>
|
||||
@@ -135,6 +137,7 @@ Filipe Lautert <filipe@icewall.org>
|
||||
Finn Thain <fthain@linux-m68k.org> <fthain@telegraphics.com.au>
|
||||
Franck Bui-Huu <vagabon.xyz@gmail.com>
|
||||
Frank Rowand <frowand.list@gmail.com> <frank.rowand@am.sony.com>
|
||||
Frank Rowand <frowand.list@gmail.com> <frank.rowand@sony.com>
|
||||
Frank Rowand <frowand.list@gmail.com> <frank.rowand@sonymobile.com>
|
||||
Frank Rowand <frowand.list@gmail.com> <frowand@mvista.com>
|
||||
Frank Zago <fzago@systemfabricworks.com>
|
||||
@@ -150,6 +153,8 @@ Greg Kroah-Hartman <gregkh@suse.de>
|
||||
Greg Kroah-Hartman <greg@kroah.com>
|
||||
Greg Kurz <groug@kaod.org> <gkurz@linux.vnet.ibm.com>
|
||||
Gregory CLEMENT <gregory.clement@bootlin.com> <gregory.clement@free-electrons.com>
|
||||
Guilherme G. Piccoli <kernel@gpiccoli.net> <gpiccoli@linux.vnet.ibm.com>
|
||||
Guilherme G. Piccoli <kernel@gpiccoli.net> <gpiccoli@canonical.com>
|
||||
Guo Ren <guoren@kernel.org> <guoren@linux.alibaba.com>
|
||||
Guo Ren <guoren@kernel.org> <ren_guo@c-sky.com>
|
||||
Gustavo Padovan <gustavo@las.ic.unicamp.br>
|
||||
@@ -253,6 +258,7 @@ Linus Lüssing <linus.luessing@c0d3.blue> <linus.luessing@web.de>
|
||||
Li Yang <leoyang.li@nxp.com> <leoli@freescale.com>
|
||||
Li Yang <leoyang.li@nxp.com> <leo@zh-kernel.org>
|
||||
Lorenzo Pieralisi <lpieralisi@kernel.org> <lorenzo.pieralisi@arm.com>
|
||||
Luca Ceresoli <luca.ceresoli@bootlin.com> <luca@lucaceresoli.net>
|
||||
Lukasz Luba <lukasz.luba@arm.com> <l.luba@partner.samsung.com>
|
||||
Maciej W. Rozycki <macro@mips.com> <macro@imgtec.com>
|
||||
Maciej W. Rozycki <macro@orcam.me.uk> <macro@linux-mips.org>
|
||||
@@ -313,6 +319,7 @@ Morten Welinder <welinder@troll.com>
|
||||
Mythri P K <mythripk@ti.com>
|
||||
Nadia Yvette Chambers <nyc@holomorphy.com> William Lee Irwin III <wli@holomorphy.com>
|
||||
Nathan Chancellor <nathan@kernel.org> <natechancellor@gmail.com>
|
||||
Neil Armstrong <neil.armstrong@linaro.org> <narmstrong@baylibre.com>
|
||||
Nguyen Anh Quynh <aquynh@gmail.com>
|
||||
Nicholas Piggin <npiggin@gmail.com> <npiggen@suse.de>
|
||||
Nicholas Piggin <npiggin@gmail.com> <npiggin@kernel.dk>
|
||||
@@ -330,6 +337,7 @@ Oleksij Rempel <linux@rempel-privat.de> <external.Oleksij.Rempel@de.bosch.com>
|
||||
Oleksij Rempel <linux@rempel-privat.de> <fixed-term.Oleksij.Rempel@de.bosch.com>
|
||||
Oleksij Rempel <linux@rempel-privat.de> <o.rempel@pengutronix.de>
|
||||
Oleksij Rempel <linux@rempel-privat.de> <ore@pengutronix.de>
|
||||
Oliver Upton <oliver.upton@linux.dev> <oupton@google.com>
|
||||
Pali Rohár <pali@kernel.org> <pali.rohar@gmail.com>
|
||||
Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it>
|
||||
Patrick Mochel <mochel@digitalimplant.org>
|
||||
|
||||
12
.rustfmt.toml
Normal file
12
.rustfmt.toml
Normal file
@@ -0,0 +1,12 @@
|
||||
edition = "2021"
|
||||
newline_style = "Unix"
|
||||
|
||||
# Unstable options that help catching some mistakes in formatting and that we may want to enable
|
||||
# when they become stable.
|
||||
#
|
||||
# They are kept here since they are useful to run from time to time.
|
||||
#format_code_in_doc_comments = true
|
||||
#reorder_impl_items = true
|
||||
#comment_width = 100
|
||||
#wrap_comments = true
|
||||
#normalize_comments = true
|
||||
@@ -227,6 +227,17 @@ Contact: dmaengine@vger.kernel.org
|
||||
Description: Indicate the number of retires for an enqcmds submission on a sharedwq.
|
||||
A max value to set attribute is capped at 64.
|
||||
|
||||
What: /sys/bus/dsa/devices/wq<m>.<n>/op_config
|
||||
Date: Sept 14, 2022
|
||||
KernelVersion: 6.0.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: Shows the operation capability bits displayed in bitmap format
|
||||
presented by %*pb printk() output format specifier.
|
||||
The attribute can be configured when the WQ is disabled in
|
||||
order to configure the WQ to accept specific bits that
|
||||
correlates to the operations allowed. It's visible only
|
||||
on platforms that support the capability.
|
||||
|
||||
What: /sys/bus/dsa/devices/engine<m>.<n>/group_id
|
||||
Date: Oct 25, 2019
|
||||
KernelVersion: 5.6.0
|
||||
@@ -255,3 +266,27 @@ Contact: dmaengine@vger.kernel.org
|
||||
Description: Indicates the number of Read Buffers reserved for the use of
|
||||
engines in the group. See DSA spec v1.2 9.2.18 GRPCFG Read Buffers
|
||||
Reserved.
|
||||
|
||||
What: /sys/bus/dsa/devices/group<m>.<n>/desc_progress_limit
|
||||
Date: Sept 14, 2022
|
||||
KernelVersion: 6.0.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: Allows control of the number of work descriptors that can be
|
||||
concurrently processed by an engine in the group as a fraction
|
||||
of the Maximum Work Descriptors in Progress value specified in
|
||||
the ENGCAP register. The acceptable values are 0 (default),
|
||||
1 (1/2 of max value), 2 (1/4 of the max value), and 3 (1/8 of
|
||||
the max value). It's visible only on platforms that support
|
||||
the capability.
|
||||
|
||||
What: /sys/bus/dsa/devices/group<m>.<n>/batch_progress_limit
|
||||
Date: Sept 14, 2022
|
||||
KernelVersion: 6.0.0
|
||||
Contact: dmaengine@vger.kernel.org
|
||||
Description: Allows control of the number of batch descriptors that can be
|
||||
concurrently processed by an engine in the group as a fraction
|
||||
of the Maximum Batch Descriptors in Progress value specified in
|
||||
the ENGCAP register. The acceptable values are 0 (default),
|
||||
1 (1/2 of max value), 2 (1/4 of the max value), and 3 (1/8 of
|
||||
the max value). It's visible only on platforms that support
|
||||
the capability.
|
||||
|
||||
@@ -54,3 +54,25 @@ Description:
|
||||
this feature.
|
||||
|
||||
Output will be in the format: "0x%08x\n".
|
||||
|
||||
What: /sys/kernel/debug/<cros-ec-device>/suspend_timeout_ms
|
||||
Date: August 2022
|
||||
KernelVersion: 6.1
|
||||
Description:
|
||||
Some ECs have a feature where they will track transitions of
|
||||
a hardware-controlled sleep line, such as Intel's SLP_S0 line,
|
||||
in order to detect cases where a system failed to go into deep
|
||||
sleep states. The suspend_timeout_ms file controls the amount of
|
||||
time in milliseconds the EC will wait before declaring a sleep
|
||||
timeout event and attempting to wake the system.
|
||||
|
||||
Supply 0 to use the default value coded into EC firmware. Supply
|
||||
65535 (EC_HOST_SLEEP_TIMEOUT_INFINITE) to disable the EC sleep
|
||||
failure detection mechanism. Values in between 0 and 65535
|
||||
indicate the number of milliseconds the EC should wait after a
|
||||
sleep transition before declaring a timeout. This includes both
|
||||
the duration after a sleep command was received but before the
|
||||
hardware line changed, as well as the duration between when the
|
||||
hardware line changed and the kernel sent an EC resume command.
|
||||
|
||||
Output will be in the format: "%u\n".
|
||||
|
||||
13
Documentation/ABI/testing/sysfs-amd-pmc
Normal file
13
Documentation/ABI/testing/sysfs-amd-pmc
Normal file
@@ -0,0 +1,13 @@
|
||||
What: /sys/bus/platform/drivers/amd_pmc/*/smu_fw_version
|
||||
Date: October 2022
|
||||
Contact: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Description: Reading this file reports the version of the firmware loaded to
|
||||
System Management Unit (SMU) contained in AMD CPUs and
|
||||
APUs.
|
||||
|
||||
What: /sys/bus/platform/drivers/amd_pmc/*/smu_program
|
||||
Date: October 2022
|
||||
Contact: Mario Limonciello <mario.limonciello@amd.com>
|
||||
Description: Reading this file reports the program corresponding to the SMU
|
||||
firmware version. The program field is used to disambiguate two
|
||||
APU/CPU models that can share the same firmware binary.
|
||||
13
Documentation/ABI/testing/sysfs-amd-pmf
Normal file
13
Documentation/ABI/testing/sysfs-amd-pmf
Normal file
@@ -0,0 +1,13 @@
|
||||
What: /sys/devices/platform/*/cnqf_enable
|
||||
Date: September 2022
|
||||
Contact: Shyam Sundar S K <Shyam-sundar.S-k@amd.com>
|
||||
Description: Reading this file tells if the AMD Platform Management(PMF)
|
||||
Cool n Quiet Framework(CnQF) feature is enabled or not.
|
||||
|
||||
This feature is not enabled by default and gets only turned on
|
||||
if OEM BIOS passes a "flag" to PMF ACPI function (index 11 or 12)
|
||||
or in case the user writes "on".
|
||||
|
||||
To turn off CnQF user can write "off" to the sysfs node.
|
||||
Note: Systems that support auto mode will not have this sysfs file
|
||||
available.
|
||||
@@ -3,7 +3,7 @@ Date: May 2011
|
||||
KernelVersion: 3.0
|
||||
Contact: Rafał Miłecki <zajec5@gmail.com>
|
||||
Description:
|
||||
Each BCMA core has it's manufacturer id. See
|
||||
Each BCMA core has its manufacturer id. See
|
||||
include/linux/bcma/bcma.h for possible values.
|
||||
|
||||
What: /sys/bus/bcma/devices/.../id
|
||||
|
||||
@@ -516,3 +516,11 @@ Contact: Mathieu Poirier <mathieu.poirier@linaro.org>
|
||||
Description: (Read) Returns the number of special conditional P1 right-hand keys
|
||||
that the trace unit can use (0x194). The value is taken
|
||||
directly from the HW.
|
||||
|
||||
What: /sys/bus/coresight/devices/etm<N>/ts_source
|
||||
Date: October 2022
|
||||
KernelVersion: 6.1
|
||||
Contact: Mathieu Poirier <mathieu.poirier@linaro.org> or Suzuki K Poulose <suzuki.poulose@arm.com>
|
||||
Description: (Read) When FEAT_TRF is implemented, value of TRFCR_ELx.TS used for
|
||||
trace session. Otherwise -1 indicates an unknown time source. Check
|
||||
trcidr0.tssize to see if a global timestamp is available.
|
||||
|
||||
@@ -4,6 +4,12 @@ Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Count data of Count Y represented as a string.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/countY/capture
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Historical capture of the Count Y count data.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/countY/ceiling
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
@@ -203,6 +209,13 @@ Description:
|
||||
both edges:
|
||||
Any state transition.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/countY/num_overflows
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
This attribute indicates the number of overflows of count Y.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/countY/capture_component_id
|
||||
What: /sys/bus/counter/devices/counterX/countY/ceiling_component_id
|
||||
What: /sys/bus/counter/devices/counterX/countY/floor_component_id
|
||||
What: /sys/bus/counter/devices/counterX/countY/count_mode_component_id
|
||||
@@ -213,11 +226,14 @@ What: /sys/bus/counter/devices/counterX/countY/prescaler_component_id
|
||||
What: /sys/bus/counter/devices/counterX/countY/preset_component_id
|
||||
What: /sys/bus/counter/devices/counterX/countY/preset_enable_component_id
|
||||
What: /sys/bus/counter/devices/counterX/countY/signalZ_action_component_id
|
||||
What: /sys/bus/counter/devices/counterX/countY/num_overflows_component_id
|
||||
What: /sys/bus/counter/devices/counterX/signalY/cable_fault_component_id
|
||||
What: /sys/bus/counter/devices/counterX/signalY/cable_fault_enable_component_id
|
||||
What: /sys/bus/counter/devices/counterX/signalY/filter_clock_prescaler_component_id
|
||||
What: /sys/bus/counter/devices/counterX/signalY/index_polarity_component_id
|
||||
What: /sys/bus/counter/devices/counterX/signalY/polarity_component_id
|
||||
What: /sys/bus/counter/devices/counterX/signalY/synchronous_mode_component_id
|
||||
What: /sys/bus/counter/devices/counterX/signalY/frequency_component_id
|
||||
KernelVersion: 5.16
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
@@ -303,6 +319,19 @@ Description:
|
||||
Discrete set of available values for the respective Signal Y
|
||||
configuration are listed in this file.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/signalY/polarity
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Active level of Signal Y. The following polarity values are
|
||||
available:
|
||||
|
||||
positive:
|
||||
Signal high state considered active level (rising edge).
|
||||
|
||||
negative:
|
||||
Signal low state considered active level (falling edge).
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/signalY/name
|
||||
KernelVersion: 5.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
@@ -345,3 +374,9 @@ Description:
|
||||
via index_polarity. The index function (as enabled via
|
||||
preset_enable) is performed synchronously with the
|
||||
quadrature clock on the active level of the index input.
|
||||
|
||||
What: /sys/bus/counter/devices/counterX/signalY/frequency
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Read-only attribute that indicates the signal Y frequency, in Hz.
|
||||
|
||||
@@ -31,7 +31,7 @@ Description: 'FCoE Controller' instances on the fcoe bus.
|
||||
1) Write interface name to ctlr_create 2) Configure the FCoE
|
||||
Controller (ctlr_X) 3) Enable the FCoE Controller to begin
|
||||
discovery and login. The FCoE Controller is destroyed by
|
||||
writing it's name, i.e. ctlr_X to the ctlr_delete file.
|
||||
writing its name, i.e. ctlr_X to the ctlr_delete file.
|
||||
|
||||
Attributes:
|
||||
|
||||
|
||||
@@ -196,7 +196,7 @@ Description:
|
||||
Raw capacitance measurement from channel Y. Units after
|
||||
application of scale and offset are nanofarads.
|
||||
|
||||
What: /sys/.../iio:deviceX/in_capacitanceY-in_capacitanceZ_raw
|
||||
What: /sys/.../iio:deviceX/in_capacitanceY-capacitanceZ_raw
|
||||
KernelVersion: 3.2
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
@@ -207,6 +207,25 @@ Description:
|
||||
is required is a consistent labeling. Units after application
|
||||
of scale and offset are nanofarads.
|
||||
|
||||
What: /sys/.../iio:deviceX/in_capacitanceY-capacitanceZ_zeropoint
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
For differential channels, this an offset that is applied
|
||||
equally to both inputs. As the reading is of the difference
|
||||
between the two inputs, this should not be applied to the _raw
|
||||
reading by userspace (unlike _offset) and unlike calibbias
|
||||
it does not affect the differential value measured because
|
||||
the effect of _zeropoint cancels out across the two inputs
|
||||
that make up the differential pair. It's purpose is to bring
|
||||
the individual signals, before the differential is measured,
|
||||
within the measurement range of the device. The naming is
|
||||
chosen because if the separate inputs that make the
|
||||
differential pair are drawn on a graph in their
|
||||
_raw units, this is the value that the zero point on the
|
||||
measurement axis represents. It is expressed with the
|
||||
same scaling as _raw.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_temp_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_tempX_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_temp_x_raw
|
||||
@@ -241,6 +260,15 @@ Description:
|
||||
Has all of the equivalent parameters as per voltageY. Units
|
||||
after application of scale and offset are m/s^2.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_accel_linear_x_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_accel_linear_y_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_accel_linear_z_raw
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
As per in_accel_X_raw attributes, but minus the
|
||||
acceleration due to gravity.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_gravity_x_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_gravity_y_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_gravity_z_raw
|
||||
@@ -2038,3 +2066,99 @@ Description:
|
||||
Available range for the forced calibration value, expressed as:
|
||||
|
||||
- a range specified as "[min step max]"
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_voltageX_sampling_frequency
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_powerY_sampling_frequency
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_currentZ_sampling_frequency
|
||||
KernelVersion: 5.20
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Some devices have separate controls of sampling frequency for
|
||||
individual channels. If multiple channels are enabled in a scan,
|
||||
then the sampling_frequency of the scan may be computed from the
|
||||
per channel sampling frequencies.
|
||||
|
||||
What: /sys/.../events/in_accel_gesture_singletap_en
|
||||
What: /sys/.../events/in_accel_gesture_doubletap_en
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Device generates an event on a single or double tap.
|
||||
|
||||
What: /sys/.../events/in_accel_gesture_singletap_value
|
||||
What: /sys/.../events/in_accel_gesture_doubletap_value
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Specifies the threshold value that the device is comparing
|
||||
against to generate the tap gesture event. The lower
|
||||
threshold value increases the sensitivity of tap detection.
|
||||
Units and the exact meaning of value are device-specific.
|
||||
|
||||
What: /sys/.../events/in_accel_gesture_tap_value_available
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Lists all available threshold values which can be used to
|
||||
modify the sensitivity of the tap detection.
|
||||
|
||||
What: /sys/.../events/in_accel_gesture_singletap_reset_timeout
|
||||
What: /sys/.../events/in_accel_gesture_doubletap_reset_timeout
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Specifies the timeout value in seconds for the tap detector
|
||||
to not to look for another tap event after the event as
|
||||
occurred. Basically the minimum quiet time between the two
|
||||
single-tap's or two double-tap's.
|
||||
|
||||
What: /sys/.../events/in_accel_gesture_tap_reset_timeout_available
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Lists all available tap reset timeout values. Units in seconds.
|
||||
|
||||
What: /sys/.../events/in_accel_gesture_doubletap_tap2_min_delay
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Specifies the minimum quiet time in seconds between the two
|
||||
taps of a double tap.
|
||||
|
||||
What: /sys/.../events/in_accel_gesture_doubletap_tap2_min_delay_available
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Lists all available delay values between two taps in the double
|
||||
tap. Units in seconds.
|
||||
|
||||
What: /sys/.../events/in_accel_gesture_tap_maxtomin_time
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Specifies the maximum time difference allowed between upper
|
||||
and lower peak of tap to consider it as the valid tap event.
|
||||
Units in seconds.
|
||||
|
||||
What: /sys/.../events/in_accel_gesture_tap_maxtomin_time_available
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Lists all available time values between upper peak to lower
|
||||
peak. Units in seconds.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_rot_yaw_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_rot_pitch_raw
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_rot_roll_raw
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Raw (unscaled) euler angles readings. Units after
|
||||
application of scale are deg.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/serialnumber
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
An example format is 16-bytes, 2-digits-per-byte, HEX-string
|
||||
representing the sensor unique ID number.
|
||||
|
||||
81
Documentation/ABI/testing/sysfs-bus-iio-bno055
Normal file
81
Documentation/ABI/testing/sysfs-bus-iio-bno055
Normal file
@@ -0,0 +1,81 @@
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_accel_raw_range
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Raw (unscaled) range for acceleration readings. Unit after
|
||||
application of scale is m/s^2. Note that this doesn't affects
|
||||
the scale (which should be used when changing the maximum and
|
||||
minimum readable value affects also the reading scaling factor).
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_anglvel_raw_range
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Range for angular velocity readings in radians per second. Note
|
||||
that this does not affects the scale (which should be used when
|
||||
changing the maximum and minimum readable value affects also the
|
||||
reading scaling factor).
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_accel_raw_range_available
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
List of allowed values for in_accel_raw_range attribute
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_anglvel_raw_range_available
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
List of allowed values for in_anglvel_raw_range attribute
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_magn_calibration_fast_enable
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Can be 1 or 0. Enables/disables the "Fast Magnetometer
|
||||
Calibration" HW function.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/fusion_enable
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Can be 1 or 0. Enables/disables the "sensor fusion" (a.k.a.
|
||||
NDOF) HW function.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/calibration_data
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Reports the binary calibration data blob for the IMU sensors.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_accel_calibration_auto_status
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Reports the autocalibration status for the accelerometer sensor.
|
||||
Can be 0 (calibration non even enabled) or 1 to 5 where the greater
|
||||
the number, the better the calibration status.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_gyro_calibration_auto_status
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Reports the autocalibration status for the gyroscope sensor.
|
||||
Can be 0 (calibration non even enabled) or 1 to 5 where the greater
|
||||
the number, the better the calibration status.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/in_magn_calibration_auto_status
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Reports the autocalibration status for the magnetometer sensor.
|
||||
Can be 0 (calibration non even enabled) or 1 to 5 where the greater
|
||||
the number, the better the calibration status.
|
||||
|
||||
What: /sys/bus/iio/devices/iio:deviceX/sys_calibration_auto_status
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Reports the status for the IMU overall autocalibration.
|
||||
Can be 0 (calibration non even enabled) or 1 to 5 where the greater
|
||||
the number, the better the calibration status.
|
||||
11
Documentation/ABI/testing/sysfs-bus-iio-cdc-ad7746
Normal file
11
Documentation/ABI/testing/sysfs-bus-iio-cdc-ad7746
Normal file
@@ -0,0 +1,11 @@
|
||||
What: /sys/.../iio:deviceX/in_capacitableY_calibbias_calibration
|
||||
What: /sys/.../iio:deviceX/in_capacitableY_calibscale_calibration
|
||||
KernelVersion: 6.1
|
||||
Contact: linux-iio@vger.kernel.org
|
||||
Description:
|
||||
Write 1 to trigger a calibration of the calibbias or
|
||||
calibscale. For calibscale, a full scale capacitance should
|
||||
be connected to the capacitance input and a
|
||||
calibscale_calibration then started. For calibbias see
|
||||
the device datasheet section on "capacitive system offset
|
||||
calibration".
|
||||
@@ -18,7 +18,7 @@ Description:
|
||||
on the signal from which time of flight measurements are
|
||||
taken.
|
||||
The appropriate values to take is dependent on both the
|
||||
sensor and it's operating environment:
|
||||
sensor and its operating environment:
|
||||
* as3935 (0-31 range)
|
||||
18 = indoors (default)
|
||||
14 = outdoors
|
||||
|
||||
@@ -457,3 +457,36 @@ Description:
|
||||
|
||||
The file is writable if the PF is bound to a driver that
|
||||
implements ->sriov_set_msix_vec_count().
|
||||
|
||||
What: /sys/bus/pci/devices/.../resourceN_resize
|
||||
Date: September 2022
|
||||
Contact: Alex Williamson <alex.williamson@redhat.com>
|
||||
Description:
|
||||
These files provide an interface to PCIe Resizable BAR support.
|
||||
A file is created for each BAR resource (N) supported by the
|
||||
PCIe Resizable BAR extended capability of the device. Reading
|
||||
each file exposes the bitmap of available resource sizes:
|
||||
|
||||
# cat resource1_resize
|
||||
00000000000001c0
|
||||
|
||||
The bitmap represents supported resource sizes for the BAR,
|
||||
where bit0 = 1MB, bit1 = 2MB, bit2 = 4MB, etc. In the above
|
||||
example the device supports 64MB, 128MB, and 256MB BAR sizes.
|
||||
|
||||
When writing the file, the user provides the bit position of
|
||||
the desired resource size, for example:
|
||||
|
||||
# echo 7 > resource1_resize
|
||||
|
||||
This indicates to set the size value corresponding to bit 7,
|
||||
128MB. The resulting size is 2 ^ (bit# + 20). This definition
|
||||
matches the PCIe specification of this capability.
|
||||
|
||||
In order to make use of resource resizing, all PCI drivers must
|
||||
be unbound from the device and peer devices under the same
|
||||
parent bridge may need to be soft removed. In the case of
|
||||
VGA devices, writing a resize value will remove low level
|
||||
console drivers from the device. Raw users of pci-sysfs
|
||||
resourceN attributes must be terminated prior to resizing.
|
||||
Success of the resizing operation is not guaranteed.
|
||||
|
||||
@@ -153,7 +153,7 @@ Date: Jan 2020
|
||||
KernelVersion: 5.5
|
||||
Contact: Mika Westerberg <mika.westerberg@linux.intel.com>
|
||||
Description: This attribute reports number of RX lanes the device is
|
||||
using simultaneusly through its upstream port.
|
||||
using simultaneously through its upstream port.
|
||||
|
||||
What: /sys/bus/thunderbolt/devices/.../tx_speed
|
||||
Date: Jan 2020
|
||||
@@ -167,7 +167,7 @@ Date: Jan 2020
|
||||
KernelVersion: 5.5
|
||||
Contact: Mika Westerberg <mika.westerberg@linux.intel.com>
|
||||
Description: This attribute reports number of TX lanes the device is
|
||||
using simultaneusly through its upstream port.
|
||||
using simultaneously through its upstream port.
|
||||
|
||||
What: /sys/bus/thunderbolt/devices/.../vendor
|
||||
Date: Sep 2017
|
||||
|
||||
@@ -364,7 +364,10 @@ Date: April 2019
|
||||
Contact: linux-pm@vger.kernel.org
|
||||
Description:
|
||||
Represents a battery percentage level, above which charging will
|
||||
stop.
|
||||
stop. Not all hardware is capable of setting this to an arbitrary
|
||||
percentage. Drivers will round written values to the nearest
|
||||
supported value. Reading back the value will show the actual
|
||||
threshold set by the driver.
|
||||
|
||||
Access: Read, Write
|
||||
|
||||
|
||||
61
Documentation/ABI/testing/sysfs-devices-hisi_ptt
Normal file
61
Documentation/ABI/testing/sysfs-devices-hisi_ptt
Normal file
@@ -0,0 +1,61 @@
|
||||
What: /sys/devices/hisi_ptt<sicl_id>_<core_id>/tune
|
||||
Date: October 2022
|
||||
KernelVersion: 6.1
|
||||
Contact: Yicong Yang <yangyicong@hisilicon.com>
|
||||
Description: This directory contains files for tuning the PCIe link
|
||||
parameters(events). Each file is named after the event
|
||||
of the PCIe link.
|
||||
|
||||
See Documentation/trace/hisi-ptt.rst for more information.
|
||||
|
||||
What: /sys/devices/hisi_ptt<sicl_id>_<core_id>/tune/qos_tx_cpl
|
||||
Date: October 2022
|
||||
KernelVersion: 6.1
|
||||
Contact: Yicong Yang <yangyicong@hisilicon.com>
|
||||
Description: (RW) Controls the weight of Tx completion TLPs, which influence
|
||||
the proportion of outbound completion TLPs on the PCIe link.
|
||||
The available tune data is [0, 1, 2]. Writing a negative value
|
||||
will return an error, and out of range values will be converted
|
||||
to 2. The value indicates a probable level of the event.
|
||||
|
||||
What: /sys/devices/hisi_ptt<sicl_id>_<core_id>/tune/qos_tx_np
|
||||
Date: October 2022
|
||||
KernelVersion: 6.1
|
||||
Contact: Yicong Yang <yangyicong@hisilicon.com>
|
||||
Description: (RW) Controls the weight of Tx non-posted TLPs, which influence
|
||||
the proportion of outbound non-posted TLPs on the PCIe link.
|
||||
The available tune data is [0, 1, 2]. Writing a negative value
|
||||
will return an error, and out of range values will be converted
|
||||
to 2. The value indicates a probable level of the event.
|
||||
|
||||
What: /sys/devices/hisi_ptt<sicl_id>_<core_id>/tune/qos_tx_p
|
||||
Date: October 2022
|
||||
KernelVersion: 6.1
|
||||
Contact: Yicong Yang <yangyicong@hisilicon.com>
|
||||
Description: (RW) Controls the weight of Tx posted TLPs, which influence the
|
||||
proportion of outbound posted TLPs on the PCIe link.
|
||||
The available tune data is [0, 1, 2]. Writing a negative value
|
||||
will return an error, and out of range values will be converted
|
||||
to 2. The value indicates a probable level of the event.
|
||||
|
||||
What: /sys/devices/hisi_ptt<sicl_id>_<core_id>/tune/rx_alloc_buf_level
|
||||
Date: October 2022
|
||||
KernelVersion: 6.1
|
||||
Contact: Yicong Yang <yangyicong@hisilicon.com>
|
||||
Description: (RW) Control the allocated buffer watermark for inbound packets.
|
||||
The packets will be stored in the buffer first and then transmitted
|
||||
either when the watermark reached or when timed out.
|
||||
The available tune data is [0, 1, 2]. Writing a negative value
|
||||
will return an error, and out of range values will be converted
|
||||
to 2. The value indicates a probable level of the event.
|
||||
|
||||
What: /sys/devices/hisi_ptt<sicl_id>_<core_id>/tune/tx_alloc_buf_level
|
||||
Date: October 2022
|
||||
KernelVersion: 6.1
|
||||
Contact: Yicong Yang <yangyicong@hisilicon.com>
|
||||
Description: (RW) Control the allocated buffer watermark of outbound packets.
|
||||
The packets will be stored in the buffer first and then transmitted
|
||||
either when the watermark reached or when timed out.
|
||||
The available tune data is [0, 1, 2]. Writing a negative value
|
||||
will return an error, and out of range values will be converted
|
||||
to 2. The value indicates a probable level of the event.
|
||||
@@ -296,7 +296,7 @@ Description: Processor frequency boosting control
|
||||
|
||||
This switch controls the boost setting for the whole system.
|
||||
Boosting allows the CPU and the firmware to run at a frequency
|
||||
beyond it's nominal limit.
|
||||
beyond its nominal limit.
|
||||
|
||||
More details can be found in
|
||||
Documentation/admin-guide/pm/cpufreq.rst
|
||||
@@ -523,6 +523,7 @@ What: /sys/devices/system/cpu/vulnerabilities
|
||||
/sys/devices/system/cpu/vulnerabilities/tsx_async_abort
|
||||
/sys/devices/system/cpu/vulnerabilities/itlb_multihit
|
||||
/sys/devices/system/cpu/vulnerabilities/mmio_stale_data
|
||||
/sys/devices/system/cpu/vulnerabilities/retbleed
|
||||
Date: January 2018
|
||||
Contact: Linux kernel mailing list <linux-kernel@vger.kernel.org>
|
||||
Description: Information about CPU vulnerabilities
|
||||
|
||||
8
Documentation/ABI/testing/sysfs-devices-vfio-dev
Normal file
8
Documentation/ABI/testing/sysfs-devices-vfio-dev
Normal file
@@ -0,0 +1,8 @@
|
||||
What: /sys/.../<device>/vfio-dev/vfioX/
|
||||
Date: September 2022
|
||||
Contact: Yi Liu <yi.l.liu@intel.com>
|
||||
Description:
|
||||
This directory is created when the device is bound to a
|
||||
vfio driver. The layout under this directory matches what
|
||||
exists for a standard 'struct device'. 'X' is a unique
|
||||
index marking this device in vfio.
|
||||
@@ -16,7 +16,7 @@ Description: Version of the application running on the device's CPU
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/clk_max_freq_mhz
|
||||
Date: Jun 2019
|
||||
KernelVersion: not yet upstreamed
|
||||
KernelVersion: 5.7
|
||||
Contact: ogabbay@kernel.org
|
||||
Description: Allows the user to set the maximum clock frequency, in MHz.
|
||||
The device clock might be set to lower value than the maximum.
|
||||
@@ -26,7 +26,7 @@ Description: Allows the user to set the maximum clock frequency, in MHz.
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/clk_cur_freq_mhz
|
||||
Date: Jun 2019
|
||||
KernelVersion: not yet upstreamed
|
||||
KernelVersion: 5.7
|
||||
Contact: ogabbay@kernel.org
|
||||
Description: Displays the current frequency, in MHz, of the device clock.
|
||||
This property is valid only for the Gaudi ASIC family
|
||||
@@ -176,6 +176,12 @@ KernelVersion: 5.1
|
||||
Contact: ogabbay@kernel.org
|
||||
Description: Version of the device's preboot F/W code
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/security_enabled
|
||||
Date: Oct 2022
|
||||
KernelVersion: 6.1
|
||||
Contact: obitton@habana.ai
|
||||
Description: Displays the device's security status
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/soft_reset
|
||||
Date: Jan 2019
|
||||
KernelVersion: 5.1
|
||||
@@ -230,6 +236,6 @@ Description: Version of the u-boot running on the device's CPU
|
||||
|
||||
What: /sys/class/habanalabs/hl<n>/vrm_ver
|
||||
Date: Jan 2022
|
||||
KernelVersion: not yet upstreamed
|
||||
KernelVersion: 5.17
|
||||
Contact: ogabbay@kernel.org
|
||||
Description: Version of the Device's Voltage Regulator Monitor F/W code. N/A to GOYA and GAUDI
|
||||
|
||||
@@ -1417,6 +1417,15 @@ Description: This node is used to set or display whether UFS WriteBooster is
|
||||
platform that doesn't support UFSHCD_CAP_CLK_SCALING, we can
|
||||
disable/enable WriteBooster through this sysfs node.
|
||||
|
||||
What: /sys/bus/platform/drivers/ufshcd/*/enable_wb_buf_flush
|
||||
What: /sys/bus/platform/devices/*.ufs/enable_wb_buf_flush
|
||||
Date: July 2022
|
||||
Contact: Jinyoung Choi <j-young.choi@samsung.com>
|
||||
Description: This entry shows the status of WriteBooster buffer flushing
|
||||
and it can be used to enable or disable the flushing.
|
||||
If flushing is enabled, the device executes the flush
|
||||
operation when the command queue is empty.
|
||||
|
||||
What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/hpb_version
|
||||
What: /sys/bus/platform/devices/*.ufs/device_descriptor/hpb_version
|
||||
Date: June 2021
|
||||
@@ -1591,6 +1600,43 @@ Description: This entry shows the status of HPB.
|
||||
|
||||
The file is read only.
|
||||
|
||||
Contact: Daniil Lunev <dlunev@chromium.org>
|
||||
What: /sys/bus/platform/drivers/ufshcd/*/capabilities/
|
||||
What: /sys/bus/platform/devices/*.ufs/capabilities/
|
||||
Date: August 2022
|
||||
Description: The group represents the effective capabilities of the
|
||||
host-device pair. i.e. the capabilities which are enabled in the
|
||||
driver for the specific host controller, supported by the host
|
||||
controller and are supported and/or have compatible
|
||||
configuration on the device side.
|
||||
|
||||
Contact: Daniil Lunev <dlunev@chromium.org>
|
||||
What: /sys/bus/platform/drivers/ufshcd/*/capabilities/clock_scaling
|
||||
What: /sys/bus/platform/devices/*.ufs/capabilities/clock_scaling
|
||||
Date: August 2022
|
||||
Contact: Daniil Lunev <dlunev@chromium.org>
|
||||
Description: Indicates status of clock scaling.
|
||||
|
||||
== ============================
|
||||
0 Clock scaling is not supported.
|
||||
1 Clock scaling is supported.
|
||||
== ============================
|
||||
|
||||
The file is read only.
|
||||
|
||||
What: /sys/bus/platform/drivers/ufshcd/*/capabilities/write_booster
|
||||
What: /sys/bus/platform/devices/*.ufs/capabilities/write_booster
|
||||
Date: August 2022
|
||||
Contact: Daniil Lunev <dlunev@chromium.org>
|
||||
Description: Indicates status of Write Booster.
|
||||
|
||||
== ============================
|
||||
0 Write Booster can not be enabled.
|
||||
1 Write Booster can be enabled.
|
||||
== ============================
|
||||
|
||||
The file is read only.
|
||||
|
||||
What: /sys/class/scsi_device/*/device/hpb_param_sysfs/activation_thld
|
||||
Date: February 2021
|
||||
Contact: Avri Altman <avri.altman@wdc.com>
|
||||
|
||||
@@ -466,6 +466,30 @@ Description: Show status of f2fs superblock in real time.
|
||||
0x4000 SBI_IS_FREEZING freefs is in process
|
||||
====== ===================== =================================
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/stat/cp_status
|
||||
Date: September 2022
|
||||
Contact: "Chao Yu" <chao.yu@oppo.com>
|
||||
Description: Show status of f2fs checkpoint in real time.
|
||||
|
||||
=============================== ==============================
|
||||
cp flag value
|
||||
CP_UMOUNT_FLAG 0x00000001
|
||||
CP_ORPHAN_PRESENT_FLAG 0x00000002
|
||||
CP_COMPACT_SUM_FLAG 0x00000004
|
||||
CP_ERROR_FLAG 0x00000008
|
||||
CP_FSCK_FLAG 0x00000010
|
||||
CP_FASTBOOT_FLAG 0x00000020
|
||||
CP_CRC_RECOVERY_FLAG 0x00000040
|
||||
CP_NAT_BITS_FLAG 0x00000080
|
||||
CP_TRIMMED_FLAG 0x00000100
|
||||
CP_NOCRC_RECOVERY_FLAG 0x00000200
|
||||
CP_LARGE_NAT_BITMAP_FLAG 0x00000400
|
||||
CP_QUOTA_NEED_FSCK_FLAG 0x00000800
|
||||
CP_DISABLED_FLAG 0x00001000
|
||||
CP_DISABLED_QUICK_FLAG 0x00002000
|
||||
CP_RESIZEFS_FLAG 0x00004000
|
||||
=============================== ==============================
|
||||
|
||||
What: /sys/fs/f2fs/<disk>/ckpt_thread_ioprio
|
||||
Date: January 2021
|
||||
Contact: "Daeho Jeong" <daehojeong@google.com>
|
||||
|
||||
@@ -55,6 +55,14 @@ Description:
|
||||
The object directory contains subdirectories for each function
|
||||
that is patched within the object.
|
||||
|
||||
What: /sys/kernel/livepatch/<patch>/<object>/patched
|
||||
Date: August 2022
|
||||
KernelVersion: 6.1.0
|
||||
Contact: live-patching@vger.kernel.org
|
||||
Description:
|
||||
An attribute which indicates whether the object is currently
|
||||
patched.
|
||||
|
||||
What: /sys/kernel/livepatch/<patch>/<object>/<function,sympos>
|
||||
Date: Nov 2014
|
||||
KernelVersion: 3.19.0
|
||||
|
||||
25
Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
Normal file
25
Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers
Normal file
@@ -0,0 +1,25 @@
|
||||
What: /sys/devices/virtual/memory_tiering/
|
||||
Date: August 2022
|
||||
Contact: Linux memory management mailing list <linux-mm@kvack.org>
|
||||
Description: A collection of all the memory tiers allocated.
|
||||
|
||||
Individual memory tier details are contained in subdirectories
|
||||
named by the abstract distance of the memory tier.
|
||||
|
||||
/sys/devices/virtual/memory_tiering/memory_tierN/
|
||||
|
||||
|
||||
What: /sys/devices/virtual/memory_tiering/memory_tierN/
|
||||
/sys/devices/virtual/memory_tiering/memory_tierN/nodes
|
||||
Date: August 2022
|
||||
Contact: Linux memory management mailing list <linux-mm@kvack.org>
|
||||
Description: Directory with details of a specific memory tier
|
||||
|
||||
This is the directory containing information about a particular
|
||||
memory tier, memtierN, where N is derived based on abstract distance.
|
||||
|
||||
A smaller value of N implies a higher (faster) memory tier in the
|
||||
hierarchy.
|
||||
|
||||
nodes: NUMA nodes that are part of this memory tier.
|
||||
|
||||
@@ -57,3 +57,44 @@ Description:
|
||||
* 0 - default,
|
||||
* 1 - overboost,
|
||||
* 2 - silent
|
||||
|
||||
What: /sys/devices/platform/<platform>/gpu_mux_mode
|
||||
Date: Aug 2022
|
||||
KernelVersion: 6.1
|
||||
Contact: "Luke Jones" <luke@ljones.dev>
|
||||
Description:
|
||||
Switch the GPU hardware MUX mode. Laptops with this feature can
|
||||
can be toggled to boot with only the dGPU (discrete mode) or in
|
||||
standard Optimus/Hybrid mode. On switch a reboot is required:
|
||||
|
||||
* 0 - Discrete GPU,
|
||||
* 1 - Optimus/Hybrid,
|
||||
|
||||
What: /sys/devices/platform/<platform>/dgpu_disable
|
||||
Date: Aug 2022
|
||||
KernelVersion: 5.17
|
||||
Contact: "Luke Jones" <luke@ljones.dev>
|
||||
Description:
|
||||
Disable discrete GPU:
|
||||
* 0 - Enable dGPU,
|
||||
* 1 - Disable dGPU
|
||||
|
||||
What: /sys/devices/platform/<platform>/egpu_enable
|
||||
Date: Aug 2022
|
||||
KernelVersion: 5.17
|
||||
Contact: "Luke Jones" <luke@ljones.dev>
|
||||
Description:
|
||||
Enable the external GPU paired with ROG X-Flow laptops.
|
||||
Toggling this setting will also trigger ACPI to disable the dGPU:
|
||||
|
||||
* 0 - Disable,
|
||||
* 1 - Enable
|
||||
|
||||
What: /sys/devices/platform/<platform>/panel_od
|
||||
Date: Aug 2022
|
||||
KernelVersion: 5.17
|
||||
Contact: "Luke Jones" <luke@ljones.dev>
|
||||
Description:
|
||||
Enable an LCD response-time boost to reduce or remove ghosting:
|
||||
* 0 - Disable,
|
||||
* 1 - Enable
|
||||
|
||||
15
Documentation/ABI/testing/sysfs-platform-brcmstb-memc
Normal file
15
Documentation/ABI/testing/sysfs-platform-brcmstb-memc
Normal file
@@ -0,0 +1,15 @@
|
||||
What: /sys/bus/platform/devices/*/srpd
|
||||
Date: July 2022
|
||||
KernelVersion: 5.21
|
||||
Contact: Florian Fainelli <f.fainelli@gmail.com>
|
||||
Description:
|
||||
Self Refresh Power Down (SRPD) inactivity timeout counted in
|
||||
internal DDR controller clock cycles. Possible values range
|
||||
from 0 (disable inactivity timeout) to 65535 (0xffff).
|
||||
|
||||
What: /sys/bus/platform/devices/*/frequency
|
||||
Date: July 2022
|
||||
KernelVersion: 5.21
|
||||
Contact: Florian Fainelli <f.fainelli@gmail.com>
|
||||
Description:
|
||||
DDR PHY frequency in Hz.
|
||||
@@ -2,8 +2,8 @@ What: /sys/bus/platform/devices/ci_hdrc.0/role
|
||||
Date: Mar 2017
|
||||
Contact: Peter Chen <peter.chen@nxp.com>
|
||||
Description:
|
||||
It returns string "gadget" or "host" when read it, it indicates
|
||||
current controller role.
|
||||
When read, it returns string "gadget" or "host", indicating
|
||||
the current controller role.
|
||||
|
||||
It will do role switch when write "gadget" or "host" to it.
|
||||
It will do role switch when "gadget" or "host" is written to it.
|
||||
Only controller at dual-role configuration supports writing.
|
||||
|
||||
@@ -152,7 +152,7 @@ Description:
|
||||
case further investigation is required to determine which
|
||||
device is causing the problem. Note that genuine RTC clock
|
||||
values (such as when pm_trace has not been used), can still
|
||||
match a device and output it's name here.
|
||||
match a device and output its name here.
|
||||
|
||||
What: /sys/power/pm_async
|
||||
Date: January 2009
|
||||
|
||||
@@ -66,8 +66,13 @@ over a rather long period of time, but improvements are always welcome!
|
||||
As a rough rule of thumb, any dereference of an RCU-protected
|
||||
pointer must be covered by rcu_read_lock(), rcu_read_lock_bh(),
|
||||
rcu_read_lock_sched(), or by the appropriate update-side lock.
|
||||
Disabling of preemption can serve as rcu_read_lock_sched(), but
|
||||
is less readable and prevents lockdep from detecting locking issues.
|
||||
Explicit disabling of preemption (preempt_disable(), for example)
|
||||
can serve as rcu_read_lock_sched(), but is less readable and
|
||||
prevents lockdep from detecting locking issues.
|
||||
|
||||
Please not that you *cannot* rely on code known to be built
|
||||
only in non-preemptible kernels. Such code can and will break,
|
||||
especially in kernels built with CONFIG_PREEMPT_COUNT=y.
|
||||
|
||||
Letting RCU-protected pointers "leak" out of an RCU read-side
|
||||
critical section is every bit as bad as letting them leak out
|
||||
@@ -185,6 +190,9 @@ over a rather long period of time, but improvements are always welcome!
|
||||
|
||||
5. If call_rcu() or call_srcu() is used, the callback function will
|
||||
be called from softirq context. In particular, it cannot block.
|
||||
If you need the callback to block, run that code in a workqueue
|
||||
handler scheduled from the callback. The queue_rcu_work()
|
||||
function does this for you in the case of call_rcu().
|
||||
|
||||
6. Since synchronize_rcu() can block, it cannot be called
|
||||
from any sort of irq context. The same rule applies
|
||||
@@ -297,7 +305,8 @@ over a rather long period of time, but improvements are always welcome!
|
||||
the machine.
|
||||
|
||||
d. Periodically invoke synchronize_rcu(), permitting a limited
|
||||
number of updates per grace period.
|
||||
number of updates per grace period. Better yet, periodically
|
||||
invoke rcu_barrier() to wait for all outstanding callbacks.
|
||||
|
||||
The same cautions apply to call_srcu() and kfree_rcu().
|
||||
|
||||
@@ -477,6 +486,6 @@ over a rather long period of time, but improvements are always welcome!
|
||||
So if you need to wait for both an RCU grace period and for
|
||||
all pre-existing call_rcu() callbacks, you will need to execute
|
||||
both rcu_barrier() and synchronize_rcu(), if necessary, using
|
||||
something like workqueues to to execute them concurrently.
|
||||
something like workqueues to execute them concurrently.
|
||||
|
||||
See rcubarrier.rst for more information.
|
||||
|
||||
@@ -61,7 +61,7 @@ checking of rcu_dereference() primitives:
|
||||
rcu_access_pointer(p):
|
||||
Return the value of the pointer and omit all barriers,
|
||||
but retain the compiler constraints that prevent duplicating
|
||||
or coalescsing. This is useful when when testing the
|
||||
or coalescsing. This is useful when testing the
|
||||
value of the pointer itself, for example, against NULL.
|
||||
|
||||
The rcu_dereference_check() check expression can be any boolean
|
||||
|
||||
@@ -128,10 +128,16 @@ Follow these rules to keep your RCU code working properly:
|
||||
This sort of comparison occurs frequently when scanning
|
||||
RCU-protected circular linked lists.
|
||||
|
||||
Note that if checks for being within an RCU read-side
|
||||
critical section are not required and the pointer is never
|
||||
dereferenced, rcu_access_pointer() should be used in place
|
||||
of rcu_dereference().
|
||||
Note that if the pointer comparison is done outside
|
||||
of an RCU read-side critical section, and the pointer
|
||||
is never dereferenced, rcu_access_pointer() should be
|
||||
used in place of rcu_dereference(). In most cases,
|
||||
it is best to avoid accidental dereferences by testing
|
||||
the rcu_access_pointer() return value directly, without
|
||||
assigning it to a variable.
|
||||
|
||||
Within an RCU read-side critical section, there is little
|
||||
reason to use rcu_access_pointer().
|
||||
|
||||
- The comparison is against a pointer that references memory
|
||||
that was initialized "a long time ago." The reason
|
||||
|
||||
@@ -6,13 +6,15 @@ What is RCU? -- "Read, Copy, Update"
|
||||
Please note that the "What is RCU?" LWN series is an excellent place
|
||||
to start learning about RCU:
|
||||
|
||||
| 1. What is RCU, Fundamentally? http://lwn.net/Articles/262464/
|
||||
| 2. What is RCU? Part 2: Usage http://lwn.net/Articles/263130/
|
||||
| 3. RCU part 3: the RCU API http://lwn.net/Articles/264090/
|
||||
| 4. The RCU API, 2010 Edition http://lwn.net/Articles/418853/
|
||||
| 2010 Big API Table http://lwn.net/Articles/419086/
|
||||
| 5. The RCU API, 2014 Edition http://lwn.net/Articles/609904/
|
||||
| 2014 Big API Table http://lwn.net/Articles/609973/
|
||||
| 1. What is RCU, Fundamentally? https://lwn.net/Articles/262464/
|
||||
| 2. What is RCU? Part 2: Usage https://lwn.net/Articles/263130/
|
||||
| 3. RCU part 3: the RCU API https://lwn.net/Articles/264090/
|
||||
| 4. The RCU API, 2010 Edition https://lwn.net/Articles/418853/
|
||||
| 2010 Big API Table https://lwn.net/Articles/419086/
|
||||
| 5. The RCU API, 2014 Edition https://lwn.net/Articles/609904/
|
||||
| 2014 Big API Table https://lwn.net/Articles/609973/
|
||||
| 6. The RCU API, 2019 Edition https://lwn.net/Articles/777036/
|
||||
| 2019 Big API Table https://lwn.net/Articles/777165/
|
||||
|
||||
|
||||
What is RCU?
|
||||
@@ -915,13 +917,18 @@ which an RCU reference is held include:
|
||||
The understanding that RCU provides a reference that only prevents a
|
||||
change of type is particularly visible with objects allocated from a
|
||||
slab cache marked ``SLAB_TYPESAFE_BY_RCU``. RCU operations may yield a
|
||||
reference to an object from such a cache that has been concurrently
|
||||
freed and the memory reallocated to a completely different object,
|
||||
though of the same type. In this case RCU doesn't even protect the
|
||||
identity of the object from changing, only its type. So the object
|
||||
found may not be the one expected, but it will be one where it is safe
|
||||
to take a reference or spinlock and then confirm that the identity
|
||||
matches the expectations.
|
||||
reference to an object from such a cache that has been concurrently freed
|
||||
and the memory reallocated to a completely different object, though of
|
||||
the same type. In this case RCU doesn't even protect the identity of the
|
||||
object from changing, only its type. So the object found may not be the
|
||||
one expected, but it will be one where it is safe to take a reference
|
||||
(and then potentially acquiring a spinlock), allowing subsequent code
|
||||
to check whether the identity matches expectations. It is tempting
|
||||
to simply acquire the spinlock without first taking the reference, but
|
||||
unfortunately any spinlock in a ``SLAB_TYPESAFE_BY_RCU`` object must be
|
||||
initialized after each and every call to kmem_cache_alloc(), which renders
|
||||
reference-free spinlock acquisition completely unsafe. Therefore, when
|
||||
using ``SLAB_TYPESAFE_BY_RCU``, make proper use of a reference counter.
|
||||
|
||||
With traditional reference counting -- such as that implemented by the
|
||||
kref library in Linux -- there is typically code that runs when the last
|
||||
@@ -1057,14 +1064,20 @@ SRCU: Initialization/cleanup::
|
||||
init_srcu_struct
|
||||
cleanup_srcu_struct
|
||||
|
||||
All: lockdep-checked RCU-protected pointer access::
|
||||
All: lockdep-checked RCU utility APIs::
|
||||
|
||||
rcu_access_pointer
|
||||
rcu_dereference_raw
|
||||
RCU_LOCKDEP_WARN
|
||||
rcu_sleep_check
|
||||
RCU_NONIDLE
|
||||
|
||||
All: Unchecked RCU-protected pointer access::
|
||||
|
||||
rcu_dereference_raw
|
||||
|
||||
All: Unchecked RCU-protected pointer access with dereferencing prohibited::
|
||||
|
||||
rcu_access_pointer
|
||||
|
||||
See the comment headers in the source code (or the docbook generated
|
||||
from them) for more information.
|
||||
|
||||
|
||||
@@ -13,7 +13,7 @@ a) waiting for a CPU (while being runnable)
|
||||
b) completion of synchronous block I/O initiated by the task
|
||||
c) swapping in pages
|
||||
d) memory reclaim
|
||||
e) thrashing page cache
|
||||
e) thrashing
|
||||
f) direct compact
|
||||
g) write-protect copy
|
||||
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
.. _readme:
|
||||
|
||||
Linux kernel release 5.x <http://kernel.org/>
|
||||
Linux kernel release 6.x <http://kernel.org/>
|
||||
=============================================
|
||||
|
||||
These are the release notes for Linux version 5. Read them carefully,
|
||||
These are the release notes for Linux version 6. Read them carefully,
|
||||
as they tell you what this is all about, explain how to install the
|
||||
kernel, and what to do if something goes wrong.
|
||||
|
||||
@@ -63,7 +63,7 @@ Installing the kernel source
|
||||
directory where you have permissions (e.g. your home directory) and
|
||||
unpack it::
|
||||
|
||||
xz -cd linux-5.x.tar.xz | tar xvf -
|
||||
xz -cd linux-6.x.tar.xz | tar xvf -
|
||||
|
||||
Replace "X" with the version number of the latest kernel.
|
||||
|
||||
@@ -72,12 +72,12 @@ Installing the kernel source
|
||||
files. They should match the library, and not get messed up by
|
||||
whatever the kernel-du-jour happens to be.
|
||||
|
||||
- You can also upgrade between 5.x releases by patching. Patches are
|
||||
- You can also upgrade between 6.x releases by patching. Patches are
|
||||
distributed in the xz format. To install by patching, get all the
|
||||
newer patch files, enter the top level directory of the kernel source
|
||||
(linux-5.x) and execute::
|
||||
(linux-6.x) and execute::
|
||||
|
||||
xz -cd ../patch-5.x.xz | patch -p1
|
||||
xz -cd ../patch-6.x.xz | patch -p1
|
||||
|
||||
Replace "x" for all versions bigger than the version "x" of your current
|
||||
source tree, **in_order**, and you should be ok. You may want to remove
|
||||
@@ -85,13 +85,13 @@ Installing the kernel source
|
||||
that there are no failed patches (some-file-name# or some-file-name.rej).
|
||||
If there are, either you or I have made a mistake.
|
||||
|
||||
Unlike patches for the 5.x kernels, patches for the 5.x.y kernels
|
||||
Unlike patches for the 6.x kernels, patches for the 6.x.y kernels
|
||||
(also known as the -stable kernels) are not incremental but instead apply
|
||||
directly to the base 5.x kernel. For example, if your base kernel is 5.0
|
||||
and you want to apply the 5.0.3 patch, you must not first apply the 5.0.1
|
||||
and 5.0.2 patches. Similarly, if you are running kernel version 5.0.2 and
|
||||
want to jump to 5.0.3, you must first reverse the 5.0.2 patch (that is,
|
||||
patch -R) **before** applying the 5.0.3 patch. You can read more on this in
|
||||
directly to the base 6.x kernel. For example, if your base kernel is 6.0
|
||||
and you want to apply the 6.0.3 patch, you must not first apply the 6.0.1
|
||||
and 6.0.2 patches. Similarly, if you are running kernel version 6.0.2 and
|
||||
want to jump to 6.0.3, you must first reverse the 6.0.2 patch (that is,
|
||||
patch -R) **before** applying the 6.0.3 patch. You can read more on this in
|
||||
:ref:`Documentation/process/applying-patches.rst <applying_patches>`.
|
||||
|
||||
Alternatively, the script patch-kernel can be used to automate this
|
||||
@@ -114,7 +114,7 @@ Installing the kernel source
|
||||
Software requirements
|
||||
---------------------
|
||||
|
||||
Compiling and running the 5.x kernels requires up-to-date
|
||||
Compiling and running the 6.x kernels requires up-to-date
|
||||
versions of various software packages. Consult
|
||||
:ref:`Documentation/process/changes.rst <changes>` for the minimum version numbers
|
||||
required and how to get updates for these packages. Beware that using
|
||||
@@ -132,12 +132,12 @@ Build directory for the kernel
|
||||
place for the output files (including .config).
|
||||
Example::
|
||||
|
||||
kernel source code: /usr/src/linux-5.x
|
||||
kernel source code: /usr/src/linux-6.x
|
||||
build directory: /home/name/build/kernel
|
||||
|
||||
To configure and build the kernel, use::
|
||||
|
||||
cd /usr/src/linux-5.x
|
||||
cd /usr/src/linux-6.x
|
||||
make O=/home/name/build/kernel menuconfig
|
||||
make O=/home/name/build/kernel
|
||||
sudo make O=/home/name/build/kernel modules_install install
|
||||
@@ -262,8 +262,6 @@ Compiling the kernel
|
||||
- Make sure you have at least gcc 5.1 available.
|
||||
For more information, refer to :ref:`Documentation/process/changes.rst <changes>`.
|
||||
|
||||
Please note that you can still run a.out user programs with this kernel.
|
||||
|
||||
- Do a ``make`` to create a compressed kernel image. It is also
|
||||
possible to do ``make install`` if you have lilo installed to suit the
|
||||
kernel makefiles, but you may want to check your particular lilo setup first.
|
||||
@@ -332,85 +330,10 @@ Compiling the kernel
|
||||
If something goes wrong
|
||||
-----------------------
|
||||
|
||||
- If you have problems that seem to be due to kernel bugs, please check
|
||||
the file MAINTAINERS to see if there is a particular person associated
|
||||
with the part of the kernel that you are having trouble with. If there
|
||||
isn't anyone listed there, then the second best thing is to mail
|
||||
them to me (torvalds@linux-foundation.org), and possibly to any other
|
||||
relevant mailing-list or to the newsgroup.
|
||||
If you have problems that seem to be due to kernel bugs, please follow the
|
||||
instructions at 'Documentation/admin-guide/reporting-issues.rst'.
|
||||
|
||||
- In all bug-reports, *please* tell what kernel you are talking about,
|
||||
how to duplicate the problem, and what your setup is (use your common
|
||||
sense). If the problem is new, tell me so, and if the problem is
|
||||
old, please try to tell me when you first noticed it.
|
||||
|
||||
- If the bug results in a message like::
|
||||
|
||||
unable to handle kernel paging request at address C0000010
|
||||
Oops: 0002
|
||||
EIP: 0010:XXXXXXXX
|
||||
eax: xxxxxxxx ebx: xxxxxxxx ecx: xxxxxxxx edx: xxxxxxxx
|
||||
esi: xxxxxxxx edi: xxxxxxxx ebp: xxxxxxxx
|
||||
ds: xxxx es: xxxx fs: xxxx gs: xxxx
|
||||
Pid: xx, process nr: xx
|
||||
xx xx xx xx xx xx xx xx xx xx
|
||||
|
||||
or similar kernel debugging information on your screen or in your
|
||||
system log, please duplicate it *exactly*. The dump may look
|
||||
incomprehensible to you, but it does contain information that may
|
||||
help debugging the problem. The text above the dump is also
|
||||
important: it tells something about why the kernel dumped code (in
|
||||
the above example, it's due to a bad kernel pointer). More information
|
||||
on making sense of the dump is in Documentation/admin-guide/bug-hunting.rst
|
||||
|
||||
- If you compiled the kernel with CONFIG_KALLSYMS you can send the dump
|
||||
as is, otherwise you will have to use the ``ksymoops`` program to make
|
||||
sense of the dump (but compiling with CONFIG_KALLSYMS is usually preferred).
|
||||
This utility can be downloaded from
|
||||
https://www.kernel.org/pub/linux/utils/kernel/ksymoops/ .
|
||||
Alternatively, you can do the dump lookup by hand:
|
||||
|
||||
- In debugging dumps like the above, it helps enormously if you can
|
||||
look up what the EIP value means. The hex value as such doesn't help
|
||||
me or anybody else very much: it will depend on your particular
|
||||
kernel setup. What you should do is take the hex value from the EIP
|
||||
line (ignore the ``0010:``), and look it up in the kernel namelist to
|
||||
see which kernel function contains the offending address.
|
||||
|
||||
To find out the kernel function name, you'll need to find the system
|
||||
binary associated with the kernel that exhibited the symptom. This is
|
||||
the file 'linux/vmlinux'. To extract the namelist and match it against
|
||||
the EIP from the kernel crash, do::
|
||||
|
||||
nm vmlinux | sort | less
|
||||
|
||||
This will give you a list of kernel addresses sorted in ascending
|
||||
order, from which it is simple to find the function that contains the
|
||||
offending address. Note that the address given by the kernel
|
||||
debugging messages will not necessarily match exactly with the
|
||||
function addresses (in fact, that is very unlikely), so you can't
|
||||
just 'grep' the list: the list will, however, give you the starting
|
||||
point of each kernel function, so by looking for the function that
|
||||
has a starting address lower than the one you are searching for but
|
||||
is followed by a function with a higher address you will find the one
|
||||
you want. In fact, it may be a good idea to include a bit of
|
||||
"context" in your problem report, giving a few lines around the
|
||||
interesting one.
|
||||
|
||||
If you for some reason cannot do the above (you have a pre-compiled
|
||||
kernel image or similar), telling me as much about your setup as
|
||||
possible will help. Please read
|
||||
'Documentation/admin-guide/reporting-issues.rst' for details.
|
||||
|
||||
- Alternatively, you can use gdb on a running kernel. (read-only; i.e. you
|
||||
cannot change values or set break points.) To do this, first compile the
|
||||
kernel with -g; edit arch/x86/Makefile appropriately, then do a ``make
|
||||
clean``. You'll also need to enable CONFIG_PROC_FS (via ``make config``).
|
||||
|
||||
After you've rebooted with the new kernel, do ``gdb vmlinux /proc/kcore``.
|
||||
You can now use all the usual gdb commands. The command to look up the
|
||||
point where your system crashed is ``l *0xXXXXXXXX``. (Replace the XXXes
|
||||
with the EIP value.)
|
||||
|
||||
gdb'ing a non-running kernel currently fails because ``gdb`` (wrongly)
|
||||
disregards the starting offset for which the kernel is compiled.
|
||||
Hints on understanding kernel bug reports are in
|
||||
'Documentation/admin-guide/bug-hunting.rst'. More on debugging the kernel
|
||||
with gdb is in 'Documentation/dev-tools/gdb-kernel-debugging.rst' and
|
||||
'Documentation/dev-tools/kgdb.rst'.
|
||||
|
||||
@@ -1,13 +0,0 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
===============
|
||||
Overriding DSDT
|
||||
===============
|
||||
|
||||
Linux supports a method of overriding the BIOS DSDT:
|
||||
|
||||
CONFIG_ACPI_CUSTOM_DSDT - builds the image into the kernel.
|
||||
|
||||
When to use this method is described in detail on the
|
||||
Linux/ACPI home page:
|
||||
https://01.org/linux-acpi/documentation/overriding-dsdt
|
||||
@@ -299,7 +299,7 @@ Per-node-per-memcgroup LRU (cgroup's private LRU) is guarded by
|
||||
lruvec->lru_lock; PG_lru bit of page->flags is cleared before
|
||||
isolating a page from its LRU under lruvec->lru_lock.
|
||||
|
||||
2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)
|
||||
2.7 Kernel Memory Extension
|
||||
-----------------------------------------------
|
||||
|
||||
With the Kernel memory extension, the Memory Controller is able to limit
|
||||
@@ -386,8 +386,6 @@ U != 0, K >= U:
|
||||
|
||||
a. Enable CONFIG_CGROUPS
|
||||
b. Enable CONFIG_MEMCG
|
||||
c. Enable CONFIG_MEMCG_SWAP (to use swap extension)
|
||||
d. Enable CONFIG_MEMCG_KMEM (to use kmem extension)
|
||||
|
||||
3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?)
|
||||
-------------------------------------------------------------------
|
||||
|
||||
@@ -976,6 +976,29 @@ All cgroup core files are prefixed with "cgroup."
|
||||
killing cgroups is a process directed operation, i.e. it affects
|
||||
the whole thread-group.
|
||||
|
||||
cgroup.pressure
|
||||
A read-write single value file that allowed values are "0" and "1".
|
||||
The default is "1".
|
||||
|
||||
Writing "0" to the file will disable the cgroup PSI accounting.
|
||||
Writing "1" to the file will re-enable the cgroup PSI accounting.
|
||||
|
||||
This control attribute is not hierarchical, so disable or enable PSI
|
||||
accounting in a cgroup does not affect PSI accounting in descendants
|
||||
and doesn't need pass enablement via ancestors from root.
|
||||
|
||||
The reason this control attribute exists is that PSI accounts stalls for
|
||||
each cgroup separately and aggregates it at each level of the hierarchy.
|
||||
This may cause non-negligible overhead for some workloads when under
|
||||
deep level of the hierarchy, in which case this control attribute can
|
||||
be used to disable PSI accounting in the non-leaf cgroups.
|
||||
|
||||
irq.pressure
|
||||
A read-write nested-keyed file.
|
||||
|
||||
Shows pressure stall information for IRQ/SOFTIRQ. See
|
||||
:ref:`Documentation/accounting/psi.rst <psi>` for details.
|
||||
|
||||
Controllers
|
||||
===========
|
||||
|
||||
@@ -1355,6 +1378,11 @@ PAGE_SIZE multiple when read back.
|
||||
pagetables
|
||||
Amount of memory allocated for page tables.
|
||||
|
||||
sec_pagetables
|
||||
Amount of memory allocated for secondary page tables,
|
||||
this currently includes KVM mmu allocations on x86
|
||||
and arm64.
|
||||
|
||||
percpu (npn)
|
||||
Amount of memory used for storing per-cpu kernel
|
||||
data structures.
|
||||
@@ -2185,75 +2213,93 @@ Cpuset Interface Files
|
||||
|
||||
It accepts only the following input values when written to.
|
||||
|
||||
======== ================================
|
||||
"root" a partition root
|
||||
"member" a non-root member of a partition
|
||||
======== ================================
|
||||
========== =====================================
|
||||
"member" Non-root member of a partition
|
||||
"root" Partition root
|
||||
"isolated" Partition root without load balancing
|
||||
========== =====================================
|
||||
|
||||
When set to be a partition root, the current cgroup is the
|
||||
root of a new partition or scheduling domain that comprises
|
||||
itself and all its descendants except those that are separate
|
||||
partition roots themselves and their descendants. The root
|
||||
cgroup is always a partition root.
|
||||
The root cgroup is always a partition root and its state
|
||||
cannot be changed. All other non-root cgroups start out as
|
||||
"member".
|
||||
|
||||
There are constraints on where a partition root can be set.
|
||||
It can only be set in a cgroup if all the following conditions
|
||||
are true.
|
||||
When set to "root", the current cgroup is the root of a new
|
||||
partition or scheduling domain that comprises itself and all
|
||||
its descendants except those that are separate partition roots
|
||||
themselves and their descendants.
|
||||
|
||||
1) The "cpuset.cpus" is not empty and the list of CPUs are
|
||||
exclusive, i.e. they are not shared by any of its siblings.
|
||||
2) The parent cgroup is a partition root.
|
||||
3) The "cpuset.cpus" is also a proper subset of the parent's
|
||||
"cpuset.cpus.effective".
|
||||
4) There is no child cgroups with cpuset enabled. This is for
|
||||
eliminating corner cases that have to be handled if such a
|
||||
condition is allowed.
|
||||
When set to "isolated", the CPUs in that partition root will
|
||||
be in an isolated state without any load balancing from the
|
||||
scheduler. Tasks placed in such a partition with multiple
|
||||
CPUs should be carefully distributed and bound to each of the
|
||||
individual CPUs for optimal performance.
|
||||
|
||||
Setting it to partition root will take the CPUs away from the
|
||||
effective CPUs of the parent cgroup. Once it is set, this
|
||||
file cannot be reverted back to "member" if there are any child
|
||||
cgroups with cpuset enabled.
|
||||
The value shown in "cpuset.cpus.effective" of a partition root
|
||||
is the CPUs that the partition root can dedicate to a potential
|
||||
new child partition root. The new child subtracts available
|
||||
CPUs from its parent "cpuset.cpus.effective".
|
||||
|
||||
A parent partition cannot distribute all its CPUs to its
|
||||
child partitions. There must be at least one cpu left in the
|
||||
parent partition.
|
||||
A partition root ("root" or "isolated") can be in one of the
|
||||
two possible states - valid or invalid. An invalid partition
|
||||
root is in a degraded state where some state information may
|
||||
be retained, but behaves more like a "member".
|
||||
|
||||
Once becoming a partition root, changes to "cpuset.cpus" is
|
||||
generally allowed as long as the first condition above is true,
|
||||
the change will not take away all the CPUs from the parent
|
||||
partition and the new "cpuset.cpus" value is a superset of its
|
||||
children's "cpuset.cpus" values.
|
||||
All possible state transitions among "member", "root" and
|
||||
"isolated" are allowed.
|
||||
|
||||
Sometimes, external factors like changes to ancestors'
|
||||
"cpuset.cpus" or cpu hotplug can cause the state of the partition
|
||||
root to change. On read, the "cpuset.sched.partition" file
|
||||
can show the following values.
|
||||
On read, the "cpuset.cpus.partition" file can show the following
|
||||
values.
|
||||
|
||||
============== ==============================
|
||||
"member" Non-root member of a partition
|
||||
"root" Partition root
|
||||
"root invalid" Invalid partition root
|
||||
============== ==============================
|
||||
============================= =====================================
|
||||
"member" Non-root member of a partition
|
||||
"root" Partition root
|
||||
"isolated" Partition root without load balancing
|
||||
"root invalid (<reason>)" Invalid partition root
|
||||
"isolated invalid (<reason>)" Invalid isolated partition root
|
||||
============================= =====================================
|
||||
|
||||
It is a partition root if the first 2 partition root conditions
|
||||
above are true and at least one CPU from "cpuset.cpus" is
|
||||
granted by the parent cgroup.
|
||||
In the case of an invalid partition root, a descriptive string on
|
||||
why the partition is invalid is included within parentheses.
|
||||
|
||||
A partition root can become invalid if none of CPUs requested
|
||||
in "cpuset.cpus" can be granted by the parent cgroup or the
|
||||
parent cgroup is no longer a partition root itself. In this
|
||||
case, it is not a real partition even though the restriction
|
||||
of the first partition root condition above will still apply.
|
||||
The cpu affinity of all the tasks in the cgroup will then be
|
||||
associated with CPUs in the nearest ancestor partition.
|
||||
For a partition root to become valid, the following conditions
|
||||
must be met.
|
||||
|
||||
An invalid partition root can be transitioned back to a
|
||||
real partition root if at least one of the requested CPUs
|
||||
can now be granted by its parent. In this case, the cpu
|
||||
affinity of all the tasks in the formerly invalid partition
|
||||
will be associated to the CPUs of the newly formed partition.
|
||||
Changing the partition state of an invalid partition root to
|
||||
"member" is always allowed even if child cpusets are present.
|
||||
1) The "cpuset.cpus" is exclusive with its siblings , i.e. they
|
||||
are not shared by any of its siblings (exclusivity rule).
|
||||
2) The parent cgroup is a valid partition root.
|
||||
3) The "cpuset.cpus" is not empty and must contain at least
|
||||
one of the CPUs from parent's "cpuset.cpus", i.e. they overlap.
|
||||
4) The "cpuset.cpus.effective" cannot be empty unless there is
|
||||
no task associated with this partition.
|
||||
|
||||
External events like hotplug or changes to "cpuset.cpus" can
|
||||
cause a valid partition root to become invalid and vice versa.
|
||||
Note that a task cannot be moved to a cgroup with empty
|
||||
"cpuset.cpus.effective".
|
||||
|
||||
For a valid partition root with the sibling cpu exclusivity
|
||||
rule enabled, changes made to "cpuset.cpus" that violate the
|
||||
exclusivity rule will invalidate the partition as well as its
|
||||
sibiling partitions with conflicting cpuset.cpus values. So
|
||||
care must be taking in changing "cpuset.cpus".
|
||||
|
||||
A valid non-root parent partition may distribute out all its CPUs
|
||||
to its child partitions when there is no task associated with it.
|
||||
|
||||
Care must be taken to change a valid partition root to
|
||||
"member" as all its child partitions, if present, will become
|
||||
invalid causing disruption to tasks running in those child
|
||||
partitions. These inactivated partitions could be recovered if
|
||||
their parent is switched back to a partition root with a proper
|
||||
set of "cpuset.cpus".
|
||||
|
||||
Poll and inotify events are triggered whenever the state of
|
||||
"cpuset.cpus.partition" changes. That includes changes caused
|
||||
by write to "cpuset.cpus.partition", cpu hotplug or other
|
||||
changes that modify the validity status of the partition.
|
||||
This will allow user space agents to monitor unexpected changes
|
||||
to "cpuset.cpus.partition" without the need to do continuous
|
||||
polling.
|
||||
|
||||
|
||||
Device controller
|
||||
|
||||
@@ -5,143 +5,115 @@ Dynamic debug
|
||||
Introduction
|
||||
============
|
||||
|
||||
This document describes how to use the dynamic debug (dyndbg) feature.
|
||||
Dynamic debug allows you to dynamically enable/disable kernel
|
||||
debug-print code to obtain additional kernel information.
|
||||
|
||||
Dynamic debug is designed to allow you to dynamically enable/disable
|
||||
kernel code to obtain additional kernel information. Currently, if
|
||||
``CONFIG_DYNAMIC_DEBUG`` is set, then all ``pr_debug()``/``dev_dbg()`` and
|
||||
``print_hex_dump_debug()``/``print_hex_dump_bytes()`` calls can be dynamically
|
||||
enabled per-callsite.
|
||||
If ``/proc/dynamic_debug/control`` exists, your kernel has dynamic
|
||||
debug. You'll need root access (sudo su) to use this.
|
||||
|
||||
If you do not want to enable dynamic debug globally (i.e. in some embedded
|
||||
system), you may set ``CONFIG_DYNAMIC_DEBUG_CORE`` as basic support of dynamic
|
||||
debug and add ``ccflags := -DDYNAMIC_DEBUG_MODULE`` into the Makefile of any
|
||||
modules which you'd like to dynamically debug later.
|
||||
Dynamic debug provides:
|
||||
|
||||
If ``CONFIG_DYNAMIC_DEBUG`` is not set, ``print_hex_dump_debug()`` is just
|
||||
shortcut for ``print_hex_dump(KERN_DEBUG)``.
|
||||
* a Catalog of all *prdbgs* in your kernel.
|
||||
``cat /proc/dynamic_debug/control`` to see them.
|
||||
|
||||
For ``print_hex_dump_debug()``/``print_hex_dump_bytes()``, format string is
|
||||
its ``prefix_str`` argument, if it is constant string; or ``hexdump``
|
||||
in case ``prefix_str`` is built dynamically.
|
||||
|
||||
Dynamic debug has even more useful features:
|
||||
|
||||
* Simple query language allows turning on and off debugging
|
||||
statements by matching any combination of 0 or 1 of:
|
||||
* a Simple query/command language to alter *prdbgs* by selecting on
|
||||
any combination of 0 or 1 of:
|
||||
|
||||
- source filename
|
||||
- function name
|
||||
- line number (including ranges of line numbers)
|
||||
- module name
|
||||
- format string
|
||||
|
||||
* Provides a debugfs control file: ``<debugfs>/dynamic_debug/control``
|
||||
which can be read to display the complete list of known debug
|
||||
statements, to help guide you
|
||||
|
||||
Controlling dynamic debug Behaviour
|
||||
===================================
|
||||
|
||||
The behaviour of ``pr_debug()``/``dev_dbg()`` are controlled via writing to a
|
||||
control file in the 'debugfs' filesystem. Thus, you must first mount
|
||||
the debugfs filesystem, in order to make use of this feature.
|
||||
Subsequently, we refer to the control file as:
|
||||
``<debugfs>/dynamic_debug/control``. For example, if you want to enable
|
||||
printing from source file ``svcsock.c``, line 1603 you simply do::
|
||||
|
||||
nullarbor:~ # echo 'file svcsock.c line 1603 +p' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
|
||||
If you make a mistake with the syntax, the write will fail thus::
|
||||
|
||||
nullarbor:~ # echo 'file svcsock.c wtf 1 +p' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
-bash: echo: write error: Invalid argument
|
||||
|
||||
Note, for systems without 'debugfs' enabled, the control file can be
|
||||
found in ``/proc/dynamic_debug/control``.
|
||||
- class name (as known/declared by each module)
|
||||
|
||||
Viewing Dynamic Debug Behaviour
|
||||
===============================
|
||||
|
||||
You can view the currently configured behaviour of all the debug
|
||||
statements via::
|
||||
You can view the currently configured behaviour in the *prdbg* catalog::
|
||||
|
||||
nullarbor:~ # cat <debugfs>/dynamic_debug/control
|
||||
:#> head -n7 /proc/dynamic_debug/control
|
||||
# filename:lineno [module]function flags format
|
||||
net/sunrpc/svc_rdma.c:323 [svcxprt_rdma]svc_rdma_cleanup =_ "SVCRDMA Module Removed, deregister RPC RDMA transport\012"
|
||||
net/sunrpc/svc_rdma.c:341 [svcxprt_rdma]svc_rdma_init =_ "\011max_inline : %d\012"
|
||||
net/sunrpc/svc_rdma.c:340 [svcxprt_rdma]svc_rdma_init =_ "\011sq_depth : %d\012"
|
||||
net/sunrpc/svc_rdma.c:338 [svcxprt_rdma]svc_rdma_init =_ "\011max_requests : %d\012"
|
||||
...
|
||||
init/main.c:1179 [main]initcall_blacklist =_ "blacklisting initcall %s\012
|
||||
init/main.c:1218 [main]initcall_blacklisted =_ "initcall %s blacklisted\012"
|
||||
init/main.c:1424 [main]run_init_process =_ " with arguments:\012"
|
||||
init/main.c:1426 [main]run_init_process =_ " %s\012"
|
||||
init/main.c:1427 [main]run_init_process =_ " with environment:\012"
|
||||
init/main.c:1429 [main]run_init_process =_ " %s\012"
|
||||
|
||||
The 3rd space-delimited column shows the current flags, preceded by
|
||||
a ``=`` for easy use with grep/cut. ``=p`` shows enabled callsites.
|
||||
|
||||
You can also apply standard Unix text manipulation filters to this
|
||||
data, e.g.::
|
||||
Controlling dynamic debug Behaviour
|
||||
===================================
|
||||
|
||||
nullarbor:~ # grep -i rdma <debugfs>/dynamic_debug/control | wc -l
|
||||
62
|
||||
The behaviour of *prdbg* sites are controlled by writing
|
||||
query/commands to the control file. Example::
|
||||
|
||||
nullarbor:~ # grep -i tcp <debugfs>/dynamic_debug/control | wc -l
|
||||
42
|
||||
# grease the interface
|
||||
:#> alias ddcmd='echo $* > /proc/dynamic_debug/control'
|
||||
|
||||
The third column shows the currently enabled flags for each debug
|
||||
statement callsite (see below for definitions of the flags). The
|
||||
default value, with no flags enabled, is ``=_``. So you can view all
|
||||
the debug statement callsites with any non-default flags::
|
||||
:#> ddcmd '-p; module main func run* +p'
|
||||
:#> grep =p /proc/dynamic_debug/control
|
||||
init/main.c:1424 [main]run_init_process =p " with arguments:\012"
|
||||
init/main.c:1426 [main]run_init_process =p " %s\012"
|
||||
init/main.c:1427 [main]run_init_process =p " with environment:\012"
|
||||
init/main.c:1429 [main]run_init_process =p " %s\012"
|
||||
|
||||
nullarbor:~ # awk '$3 != "=_"' <debugfs>/dynamic_debug/control
|
||||
# filename:lineno [module]function flags format
|
||||
net/sunrpc/svcsock.c:1603 [sunrpc]svc_send p "svc_process: st_sendto returned %d\012"
|
||||
Error messages go to console/syslog::
|
||||
|
||||
:#> ddcmd mode foo +p
|
||||
dyndbg: unknown keyword "mode"
|
||||
dyndbg: query parse failed
|
||||
bash: echo: write error: Invalid argument
|
||||
|
||||
If debugfs is also enabled and mounted, ``dynamic_debug/control`` is
|
||||
also under the mount-dir, typically ``/sys/kernel/debug/``.
|
||||
|
||||
Command Language Reference
|
||||
==========================
|
||||
|
||||
At the lexical level, a command comprises a sequence of words separated
|
||||
At the basic lexical level, a command is a sequence of words separated
|
||||
by spaces or tabs. So these are all equivalent::
|
||||
|
||||
nullarbor:~ # echo -n 'file svcsock.c line 1603 +p' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
nullarbor:~ # echo -n ' file svcsock.c line 1603 +p ' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
nullarbor:~ # echo -n 'file svcsock.c line 1603 +p' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
:#> ddcmd file svcsock.c line 1603 +p
|
||||
:#> ddcmd "file svcsock.c line 1603 +p"
|
||||
:#> ddcmd ' file svcsock.c line 1603 +p '
|
||||
|
||||
Command submissions are bounded by a write() system call.
|
||||
Multiple commands can be written together, separated by ``;`` or ``\n``::
|
||||
|
||||
~# echo "func pnpacpi_get_resources +p; func pnp_assign_mem +p" \
|
||||
> <debugfs>/dynamic_debug/control
|
||||
:#> ddcmd "func pnpacpi_get_resources +p; func pnp_assign_mem +p"
|
||||
:#> ddcmd <<"EOC"
|
||||
func pnpacpi_get_resources +p
|
||||
func pnp_assign_mem +p
|
||||
EOC
|
||||
:#> cat query-batch-file > /proc/dynamic_debug/control
|
||||
|
||||
If your query set is big, you can batch them too::
|
||||
You can also use wildcards in each query term. The match rule supports
|
||||
``*`` (matches zero or more characters) and ``?`` (matches exactly one
|
||||
character). For example, you can match all usb drivers::
|
||||
|
||||
~# cat query-batch-file > <debugfs>/dynamic_debug/control
|
||||
:#> ddcmd file "drivers/usb/*" +p # "" to suppress shell expansion
|
||||
|
||||
Another way is to use wildcards. The match rule supports ``*`` (matches
|
||||
zero or more characters) and ``?`` (matches exactly one character). For
|
||||
example, you can match all usb drivers::
|
||||
|
||||
~# echo "file drivers/usb/* +p" > <debugfs>/dynamic_debug/control
|
||||
|
||||
At the syntactical level, a command comprises a sequence of match
|
||||
specifications, followed by a flags change specification::
|
||||
Syntactically, a command is pairs of keyword values, followed by a
|
||||
flags change or setting::
|
||||
|
||||
command ::= match-spec* flags-spec
|
||||
|
||||
The match-spec's are used to choose a subset of the known pr_debug()
|
||||
callsites to which to apply the flags-spec. Think of them as a query
|
||||
with implicit ANDs between each pair. Note that an empty list of
|
||||
match-specs will select all debug statement callsites.
|
||||
The match-spec's select *prdbgs* from the catalog, upon which to apply
|
||||
the flags-spec, all constraints are ANDed together. An absent keyword
|
||||
is the same as keyword "*".
|
||||
|
||||
A match specification comprises a keyword, which controls the
|
||||
attribute of the callsite to be compared, and a value to compare
|
||||
against. Possible keywords are:::
|
||||
|
||||
A match specification is a keyword, which selects the attribute of
|
||||
the callsite to be compared, and a value to compare against. Possible
|
||||
keywords are:::
|
||||
|
||||
match-spec ::= 'func' string |
|
||||
'file' string |
|
||||
'module' string |
|
||||
'format' string |
|
||||
'class' string |
|
||||
'line' line-range
|
||||
|
||||
line-range ::= lineno |
|
||||
@@ -203,6 +175,16 @@ format
|
||||
format "nfsd: SETATTR" // a neater way to match a format with whitespace
|
||||
format 'nfsd: SETATTR' // yet another way to match a format with whitespace
|
||||
|
||||
class
|
||||
The given class_name is validated against each module, which may
|
||||
have declared a list of known class_names. If the class_name is
|
||||
found for a module, callsite & class matching and adjustment
|
||||
proceeds. Examples::
|
||||
|
||||
class DRM_UT_KMS # a DRM.debug category
|
||||
class JUNK # silent non-match
|
||||
// class TLD_* # NOTICE: no wildcard in class names
|
||||
|
||||
line
|
||||
The given line number or range of line numbers is compared
|
||||
against the line number of each ``pr_debug()`` callsite. A single
|
||||
@@ -228,17 +210,16 @@ of the characters::
|
||||
The flags are::
|
||||
|
||||
p enables the pr_debug() callsite.
|
||||
f Include the function name in the printed message
|
||||
l Include line number in the printed message
|
||||
m Include module name in the printed message
|
||||
t Include thread ID in messages not generated from interrupt context
|
||||
_ No flags are set. (Or'd with others on input)
|
||||
_ enables no flags.
|
||||
|
||||
For ``print_hex_dump_debug()`` and ``print_hex_dump_bytes()``, only ``p`` flag
|
||||
have meaning, other flags ignored.
|
||||
Decorator flags add to the message-prefix, in order:
|
||||
t Include thread ID, or <intr>
|
||||
m Include module name
|
||||
f Include the function name
|
||||
l Include line number
|
||||
|
||||
For display, the flags are preceded by ``=``
|
||||
(mnemonic: what the flags are currently equal to).
|
||||
For ``print_hex_dump_debug()`` and ``print_hex_dump_bytes()``, only
|
||||
the ``p`` flag has meaning, other flags are ignored.
|
||||
|
||||
Note the regexp ``^[-+=][flmpt_]+$`` matches a flags specification.
|
||||
To clear all flags at once, use ``=_`` or ``-flmpt``.
|
||||
@@ -313,7 +294,7 @@ For ``CONFIG_DYNAMIC_DEBUG`` kernels, any settings given at boot-time (or
|
||||
enabled by ``-DDEBUG`` flag during compilation) can be disabled later via
|
||||
the debugfs interface if the debug messages are no longer needed::
|
||||
|
||||
echo "module module_name -p" > <debugfs>/dynamic_debug/control
|
||||
echo "module module_name -p" > /proc/dynamic_debug/control
|
||||
|
||||
Examples
|
||||
========
|
||||
@@ -321,37 +302,31 @@ Examples
|
||||
::
|
||||
|
||||
// enable the message at line 1603 of file svcsock.c
|
||||
nullarbor:~ # echo -n 'file svcsock.c line 1603 +p' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
:#> ddcmd 'file svcsock.c line 1603 +p'
|
||||
|
||||
// enable all the messages in file svcsock.c
|
||||
nullarbor:~ # echo -n 'file svcsock.c +p' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
:#> ddcmd 'file svcsock.c +p'
|
||||
|
||||
// enable all the messages in the NFS server module
|
||||
nullarbor:~ # echo -n 'module nfsd +p' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
:#> ddcmd 'module nfsd +p'
|
||||
|
||||
// enable all 12 messages in the function svc_process()
|
||||
nullarbor:~ # echo -n 'func svc_process +p' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
:#> ddcmd 'func svc_process +p'
|
||||
|
||||
// disable all 12 messages in the function svc_process()
|
||||
nullarbor:~ # echo -n 'func svc_process -p' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
:#> ddcmd 'func svc_process -p'
|
||||
|
||||
// enable messages for NFS calls READ, READLINK, READDIR and READDIR+.
|
||||
nullarbor:~ # echo -n 'format "nfsd: READ" +p' >
|
||||
<debugfs>/dynamic_debug/control
|
||||
:#> ddcmd 'format "nfsd: READ" +p'
|
||||
|
||||
// enable messages in files of which the paths include string "usb"
|
||||
nullarbor:~ # echo -n 'file *usb* +p' > <debugfs>/dynamic_debug/control
|
||||
:#> ddcmd 'file *usb* +p' > /proc/dynamic_debug/control
|
||||
|
||||
// enable all messages
|
||||
nullarbor:~ # echo -n '+p' > <debugfs>/dynamic_debug/control
|
||||
:#> ddcmd '+p' > /proc/dynamic_debug/control
|
||||
|
||||
// add module, function to all enabled messages
|
||||
nullarbor:~ # echo -n '+mf' > <debugfs>/dynamic_debug/control
|
||||
:#> ddcmd '+mf' > /proc/dynamic_debug/control
|
||||
|
||||
// boot-args example, with newlines and comments for readability
|
||||
Kernel command line: ...
|
||||
@@ -364,3 +339,38 @@ Examples
|
||||
dyndbg="file init/* +p #cmt ; func parse_one +p"
|
||||
// enable pr_debugs in 2 functions in a module loaded later
|
||||
pc87360.dyndbg="func pc87360_init_device +p; func pc87360_find +p"
|
||||
|
||||
Kernel Configuration
|
||||
====================
|
||||
|
||||
Dynamic Debug is enabled via kernel config items::
|
||||
|
||||
CONFIG_DYNAMIC_DEBUG=y # build catalog, enables CORE
|
||||
CONFIG_DYNAMIC_DEBUG_CORE=y # enable mechanics only, skip catalog
|
||||
|
||||
If you do not want to enable dynamic debug globally (i.e. in some embedded
|
||||
system), you may set ``CONFIG_DYNAMIC_DEBUG_CORE`` as basic support of dynamic
|
||||
debug and add ``ccflags := -DDYNAMIC_DEBUG_MODULE`` into the Makefile of any
|
||||
modules which you'd like to dynamically debug later.
|
||||
|
||||
|
||||
Kernel *prdbg* API
|
||||
==================
|
||||
|
||||
The following functions are cataloged and controllable when dynamic
|
||||
debug is enabled::
|
||||
|
||||
pr_debug()
|
||||
dev_dbg()
|
||||
print_hex_dump_debug()
|
||||
print_hex_dump_bytes()
|
||||
|
||||
Otherwise, they are off by default; ``ccflags += -DDEBUG`` or
|
||||
``#define DEBUG`` in a source file will enable them appropriately.
|
||||
|
||||
If ``CONFIG_DYNAMIC_DEBUG`` is not set, ``print_hex_dump_debug()`` is
|
||||
just a shortcut for ``print_hex_dump(KERN_DEBUG)``.
|
||||
|
||||
For ``print_hex_dump_debug()``/``print_hex_dump_bytes()``, format string is
|
||||
its ``prefix_str`` argument, if it is constant string; or ``hexdump``
|
||||
in case ``prefix_str`` is built dynamically.
|
||||
|
||||
@@ -230,6 +230,20 @@ The possible values in this file are:
|
||||
* - 'Mitigation: Clear CPU buffers'
|
||||
- The processor is vulnerable and the CPU buffer clearing mitigation is
|
||||
enabled.
|
||||
* - 'Unknown: No mitigations'
|
||||
- The processor vulnerability status is unknown because it is
|
||||
out of Servicing period. Mitigation is not attempted.
|
||||
|
||||
Definitions:
|
||||
------------
|
||||
|
||||
Servicing period: The process of providing functional and security updates to
|
||||
Intel processors or platforms, utilizing the Intel Platform Update (IPU)
|
||||
process or other similar mechanisms.
|
||||
|
||||
End of Servicing Updates (ESU): ESU is the date at which Intel will no
|
||||
longer provide Servicing, such as through IPU or other similar update
|
||||
processes. ESU dates will typically be aligned to end of quarter.
|
||||
|
||||
If the processor is vulnerable then the following information is appended to
|
||||
the above information:
|
||||
|
||||
@@ -613,6 +613,7 @@ kernel command line.
|
||||
eibrs enhanced IBRS
|
||||
eibrs,retpoline enhanced IBRS + Retpolines
|
||||
eibrs,lfence enhanced IBRS + LFENCE
|
||||
ibrs use IBRS to protect kernel
|
||||
|
||||
Not specifying this option is equivalent to
|
||||
spectre_v2=auto.
|
||||
|
||||
@@ -200,7 +200,7 @@ prb
|
||||
|
||||
A pointer to the printk ringbuffer (struct printk_ringbuffer). This
|
||||
may be pointing to the static boot ringbuffer or the dynamically
|
||||
allocated ringbuffer, depending on when the the core dump occurred.
|
||||
allocated ringbuffer, depending on when the core dump occurred.
|
||||
Used by user-space tools to read the active kernel log buffer.
|
||||
|
||||
printk_rb_static
|
||||
|
||||
@@ -321,6 +321,8 @@
|
||||
force_enable - Force enable the IOMMU on platforms known
|
||||
to be buggy with IOMMU enabled. Use this
|
||||
option with care.
|
||||
pgtbl_v1 - Use v1 page table for DMA-API (Default).
|
||||
pgtbl_v2 - Use v2 page table for DMA-API.
|
||||
|
||||
amd_iommu_dump= [HW,X86-64]
|
||||
Enable AMD IOMMU driver option to dump the ACPI table
|
||||
@@ -966,10 +968,6 @@
|
||||
|
||||
debugpat [X86] Enable PAT debugging
|
||||
|
||||
decnet.addr= [HW,NET]
|
||||
Format: <area>[,<node>]
|
||||
See also Documentation/networking/decnet.rst.
|
||||
|
||||
default_hugepagesz=
|
||||
[HW] The size of the default HugeTLB page. This is
|
||||
the size represented by the legacy /proc/ hugepages
|
||||
@@ -1471,6 +1469,14 @@
|
||||
Permit 'security.evm' to be updated regardless of
|
||||
current integrity status.
|
||||
|
||||
early_page_ext [KNL] Enforces page_ext initialization to earlier
|
||||
stages so cover more early boot allocations.
|
||||
Please note that as side effect some optimizations
|
||||
might be disabled to achieve that (e.g. parallelized
|
||||
memory initialization is disabled) so the boot process
|
||||
might take longer, especially on systems with a lot of
|
||||
memory. Available with CONFIG_PAGE_EXTENSION=y.
|
||||
|
||||
failslab=
|
||||
fail_usercopy=
|
||||
fail_page_alloc=
|
||||
@@ -2436,6 +2442,12 @@
|
||||
0: force disabled
|
||||
1: force enabled
|
||||
|
||||
kunit.enable= [KUNIT] Enable executing KUnit tests. Requires
|
||||
CONFIG_KUNIT to be set to be fully enabled. The
|
||||
default value can be overridden via
|
||||
KUNIT_DEFAULT_ENABLED.
|
||||
Default is 1 (enabled)
|
||||
|
||||
kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs.
|
||||
Default is 0 (don't ignore, but inject #GP)
|
||||
|
||||
@@ -3207,6 +3219,7 @@
|
||||
spectre_v2_user=off [X86]
|
||||
spec_store_bypass_disable=off [X86,PPC]
|
||||
ssbd=force-off [ARM64]
|
||||
nospectre_bhb [ARM64]
|
||||
l1tf=off [X86]
|
||||
mds=off [X86]
|
||||
tsx_async_abort=off [X86]
|
||||
@@ -3613,7 +3626,7 @@
|
||||
|
||||
nohugeiomap [KNL,X86,PPC,ARM64] Disable kernel huge I/O mappings.
|
||||
|
||||
nohugevmalloc [PPC] Disable kernel huge vmalloc mappings.
|
||||
nohugevmalloc [KNL,X86,PPC,ARM64] Disable kernel huge vmalloc mappings.
|
||||
|
||||
nosmt [KNL,S390] Disable symmetric multithreading (SMT).
|
||||
Equivalent to smt=1.
|
||||
@@ -3626,11 +3639,15 @@
|
||||
(bounds check bypass). With this option data leaks are
|
||||
possible in the system.
|
||||
|
||||
nospectre_v2 [X86,PPC_FSL_BOOK3E,ARM64] Disable all mitigations for
|
||||
nospectre_v2 [X86,PPC_E500,ARM64] Disable all mitigations for
|
||||
the Spectre variant 2 (indirect branch prediction)
|
||||
vulnerability. System may allow data leaks with this
|
||||
option.
|
||||
|
||||
nospectre_bhb [ARM64] Disable all mitigations for Spectre-BHB (branch
|
||||
history injection) vulnerability. System may allow data leaks
|
||||
with this option.
|
||||
|
||||
nospec_store_bypass_disable
|
||||
[HW] Disable all mitigations for the Speculative Store Bypass vulnerability
|
||||
|
||||
@@ -3741,9 +3758,9 @@
|
||||
[X86,PV_OPS] Disable paravirtualized VMware scheduler
|
||||
clock and use the default one.
|
||||
|
||||
no-steal-acc [X86,PV_OPS,ARM64] Disable paravirtualized steal time
|
||||
accounting. steal time is computed, but won't
|
||||
influence scheduler behaviour
|
||||
no-steal-acc [X86,PV_OPS,ARM64,PPC/PSERIES] Disable paravirtualized
|
||||
steal time accounting. steal time is computed, but
|
||||
won't influence scheduler behaviour
|
||||
|
||||
nolapic [X86-32,APIC] Do not enable or use the local APIC.
|
||||
|
||||
@@ -3805,6 +3822,10 @@
|
||||
|
||||
nox2apic [X86-64,APIC] Do not enable x2APIC mode.
|
||||
|
||||
NOTE: this parameter will be ignored on systems with the
|
||||
LEGACY_XAPIC_DISABLED bit set in the
|
||||
IA32_XAPIC_DISABLE_STATUS MSR.
|
||||
|
||||
nps_mtm_hs_ctr= [KNL,ARC]
|
||||
This parameter sets the maximum duration, in
|
||||
cycles, each HW thread of the CTOP can run
|
||||
@@ -5331,6 +5352,8 @@
|
||||
rodata= [KNL]
|
||||
on Mark read-only kernel memory as read-only (default).
|
||||
off Leave read-only kernel memory writable for debugging.
|
||||
full Mark read-only kernel memory and aliases as read-only
|
||||
[arm64]
|
||||
|
||||
rockchip.usb_uart
|
||||
Enable the uart passthrough on the designated usb port
|
||||
@@ -6026,12 +6049,6 @@
|
||||
This parameter controls use of the Protected
|
||||
Execution Facility on pSeries.
|
||||
|
||||
swapaccount= [KNL]
|
||||
Format: [0|1]
|
||||
Enable accounting of swap in memory resource
|
||||
controller if no parameter or 1 is given or disable
|
||||
it if 0 is given (See Documentation/admin-guide/cgroup-v1/memory.rst)
|
||||
|
||||
swiotlb= [ARM,IA-64,PPC,MIPS,X86]
|
||||
Format: { <int> [,<int>] | force | noforce }
|
||||
<int> -- Number of I/O TLB slabs
|
||||
@@ -6834,6 +6851,12 @@
|
||||
Crash from Xen panic notifier, without executing late
|
||||
panic() code such as dumping handler.
|
||||
|
||||
xen_msr_safe= [X86,XEN]
|
||||
Format: <bool>
|
||||
Select whether to always use non-faulting (safe) MSR
|
||||
access functions when running as Xen PV guest. The
|
||||
default value is controlled by CONFIG_XEN_PV_MSR_SAFE.
|
||||
|
||||
xen_nopvspin [X86,XEN]
|
||||
Disables the qspinlock slowpath using Xen PV optimizations.
|
||||
This parameter is obsoleted by "nopvspin" parameter, which
|
||||
|
||||
@@ -5,10 +5,10 @@ CMA Debugfs Interface
|
||||
The CMA debugfs interface is useful to retrieve basic information out of the
|
||||
different CMA areas and to test allocation/release in each of the areas.
|
||||
|
||||
Each CMA zone represents a directory under <debugfs>/cma/, indexed by the
|
||||
kernel's CMA index. So the first CMA zone would be:
|
||||
Each CMA area represents a directory under <debugfs>/cma/, represented by
|
||||
its CMA name like below:
|
||||
|
||||
<debugfs>/cma/cma-0
|
||||
<debugfs>/cma/<cma_name>
|
||||
|
||||
The structure of the files created under that directory is as follows:
|
||||
|
||||
@@ -18,8 +18,8 @@ The structure of the files created under that directory is as follows:
|
||||
- [RO] bitmap: The bitmap of page states in the zone.
|
||||
- [WO] alloc: Allocate N pages from that CMA area. For example::
|
||||
|
||||
echo 5 > <debugfs>/cma/cma-2/alloc
|
||||
echo 5 > <debugfs>/cma/<cma_name>/alloc
|
||||
|
||||
would try to allocate 5 pages from the cma-2 area.
|
||||
would try to allocate 5 pages from the 'cma_name' area.
|
||||
|
||||
- [WO] free: Free N pages from that CMA area, similar to the above.
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
========================
|
||||
Monitoring Data Accesses
|
||||
========================
|
||||
==========================
|
||||
DAMON: Data Access MONitor
|
||||
==========================
|
||||
|
||||
:doc:`DAMON </mm/damon/index>` allows light-weight data access monitoring.
|
||||
Using DAMON, users can analyze the memory access patterns of their systems and
|
||||
|
||||
@@ -29,16 +29,9 @@ called DAMON Operator (DAMO). It is available at
|
||||
https://github.com/awslabs/damo. The examples below assume that ``damo`` is on
|
||||
your ``$PATH``. It's not mandatory, though.
|
||||
|
||||
Because DAMO is using the debugfs interface (refer to :doc:`usage` for the
|
||||
detail) of DAMON, you should ensure debugfs is mounted. Mount it manually as
|
||||
below::
|
||||
|
||||
# mount -t debugfs none /sys/kernel/debug/
|
||||
|
||||
or append the following line to your ``/etc/fstab`` file so that your system
|
||||
can automatically mount debugfs upon booting::
|
||||
|
||||
debugfs /sys/kernel/debug debugfs defaults 0 0
|
||||
Because DAMO is using the sysfs interface (refer to :doc:`usage` for the
|
||||
detail) of DAMON, you should ensure :doc:`sysfs </filesystems/sysfs>` is
|
||||
mounted.
|
||||
|
||||
|
||||
Recording Data Access Patterns
|
||||
|
||||
@@ -50,10 +50,10 @@ For a short example, users can monitor the virtual address space of a given
|
||||
workload as below. ::
|
||||
|
||||
# cd /sys/kernel/mm/damon/admin/
|
||||
# echo 1 > kdamonds/nr && echo 1 > kdamonds/0/contexts/nr
|
||||
# echo 1 > kdamonds/nr_kdamonds && echo 1 > kdamonds/0/contexts/nr_contexts
|
||||
# echo vaddr > kdamonds/0/contexts/0/operations
|
||||
# echo 1 > kdamonds/0/contexts/0/targets/nr
|
||||
# echo $(pidof <workload>) > kdamonds/0/contexts/0/targets/0/pid
|
||||
# echo 1 > kdamonds/0/contexts/0/targets/nr_targets
|
||||
# echo $(pidof <workload>) > kdamonds/0/contexts/0/targets/0/pid_target
|
||||
# echo on > kdamonds/0/state
|
||||
|
||||
Files Hierarchy
|
||||
@@ -366,12 +366,12 @@ memory rate becomes larger than 60%, or lower than 30%". ::
|
||||
# echo 1 > kdamonds/0/contexts/0/schemes/nr_schemes
|
||||
# cd kdamonds/0/contexts/0/schemes/0
|
||||
# # set the basic access pattern and the action
|
||||
# echo 4096 > access_patterns/sz/min
|
||||
# echo 8192 > access_patterns/sz/max
|
||||
# echo 0 > access_patterns/nr_accesses/min
|
||||
# echo 5 > access_patterns/nr_accesses/max
|
||||
# echo 10 > access_patterns/age/min
|
||||
# echo 20 > access_patterns/age/max
|
||||
# echo 4096 > access_pattern/sz/min
|
||||
# echo 8192 > access_pattern/sz/max
|
||||
# echo 0 > access_pattern/nr_accesses/min
|
||||
# echo 5 > access_pattern/nr_accesses/max
|
||||
# echo 10 > access_pattern/age/min
|
||||
# echo 20 > access_pattern/age/max
|
||||
# echo pageout > action
|
||||
# # set quotas
|
||||
# echo 10 > quotas/ms
|
||||
@@ -393,6 +393,11 @@ the files as above. Above is only for an example.
|
||||
debugfs Interface
|
||||
=================
|
||||
|
||||
.. note::
|
||||
|
||||
DAMON debugfs interface will be removed after next LTS kernel is released, so
|
||||
users should move to the :ref:`sysfs interface <sysfs_interface>`.
|
||||
|
||||
DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``,
|
||||
``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and
|
||||
``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
|
||||
|
||||
@@ -65,7 +65,7 @@ HugePages_Surp
|
||||
may be temporarily larger than the maximum number of surplus huge
|
||||
pages when the system is under memory pressure.
|
||||
Hugepagesize
|
||||
is the default hugepage size (in Kb).
|
||||
is the default hugepage size (in kB).
|
||||
Hugetlb
|
||||
is the total amount of memory (in kB), consumed by huge
|
||||
pages of all sizes.
|
||||
|
||||
@@ -32,6 +32,7 @@ the Linux memory management.
|
||||
idle_page_tracking
|
||||
ksm
|
||||
memory-hotplug
|
||||
multigen_lru
|
||||
nommu-mmap
|
||||
numa_memory_policy
|
||||
numaperf
|
||||
|
||||
@@ -184,6 +184,42 @@ The maximum possible ``pages_sharing/pages_shared`` ratio is limited by the
|
||||
``max_page_sharing`` tunable. To increase the ratio ``max_page_sharing`` must
|
||||
be increased accordingly.
|
||||
|
||||
Monitoring KSM profit
|
||||
=====================
|
||||
|
||||
KSM can save memory by merging identical pages, but also can consume
|
||||
additional memory, because it needs to generate a number of rmap_items to
|
||||
save each scanned page's brief rmap information. Some of these pages may
|
||||
be merged, but some may not be abled to be merged after being checked
|
||||
several times, which are unprofitable memory consumed.
|
||||
|
||||
1) How to determine whether KSM save memory or consume memory in system-wide
|
||||
range? Here is a simple approximate calculation for reference::
|
||||
|
||||
general_profit =~ pages_sharing * sizeof(page) - (all_rmap_items) *
|
||||
sizeof(rmap_item);
|
||||
|
||||
where all_rmap_items can be easily obtained by summing ``pages_sharing``,
|
||||
``pages_shared``, ``pages_unshared`` and ``pages_volatile``.
|
||||
|
||||
2) The KSM profit inner a single process can be similarly obtained by the
|
||||
following approximate calculation::
|
||||
|
||||
process_profit =~ ksm_merging_pages * sizeof(page) -
|
||||
ksm_rmap_items * sizeof(rmap_item).
|
||||
|
||||
where ksm_merging_pages is shown under the directory ``/proc/<pid>/``,
|
||||
and ksm_rmap_items is shown in ``/proc/<pid>/ksm_stat``.
|
||||
|
||||
From the perspective of application, a high ratio of ``ksm_rmap_items`` to
|
||||
``ksm_merging_pages`` means a bad madvise-applied policy, so developers or
|
||||
administrators have to rethink how to change madvise policy. Giving an example
|
||||
for reference, a page's size is usually 4K, and the rmap_item's size is
|
||||
separately 32B on 32-bit CPU architecture and 64B on 64-bit CPU architecture.
|
||||
so if the ``ksm_rmap_items/ksm_merging_pages`` ratio exceeds 64 on 64-bit CPU
|
||||
or exceeds 128 on 32-bit CPU, then the app's madvise policy should be dropped,
|
||||
because the ksm profit is approximately zero or negative.
|
||||
|
||||
Monitoring KSM events
|
||||
=====================
|
||||
|
||||
|
||||
162
Documentation/admin-guide/mm/multigen_lru.rst
Normal file
162
Documentation/admin-guide/mm/multigen_lru.rst
Normal file
@@ -0,0 +1,162 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=============
|
||||
Multi-Gen LRU
|
||||
=============
|
||||
The multi-gen LRU is an alternative LRU implementation that optimizes
|
||||
page reclaim and improves performance under memory pressure. Page
|
||||
reclaim decides the kernel's caching policy and ability to overcommit
|
||||
memory. It directly impacts the kswapd CPU usage and RAM efficiency.
|
||||
|
||||
Quick start
|
||||
===========
|
||||
Build the kernel with the following configurations.
|
||||
|
||||
* ``CONFIG_LRU_GEN=y``
|
||||
* ``CONFIG_LRU_GEN_ENABLED=y``
|
||||
|
||||
All set!
|
||||
|
||||
Runtime options
|
||||
===============
|
||||
``/sys/kernel/mm/lru_gen/`` contains stable ABIs described in the
|
||||
following subsections.
|
||||
|
||||
Kill switch
|
||||
-----------
|
||||
``enabled`` accepts different values to enable or disable the
|
||||
following components. Its default value depends on
|
||||
``CONFIG_LRU_GEN_ENABLED``. All the components should be enabled
|
||||
unless some of them have unforeseen side effects. Writing to
|
||||
``enabled`` has no effect when a component is not supported by the
|
||||
hardware, and valid values will be accepted even when the main switch
|
||||
is off.
|
||||
|
||||
====== ===============================================================
|
||||
Values Components
|
||||
====== ===============================================================
|
||||
0x0001 The main switch for the multi-gen LRU.
|
||||
0x0002 Clearing the accessed bit in leaf page table entries in large
|
||||
batches, when MMU sets it (e.g., on x86). This behavior can
|
||||
theoretically worsen lock contention (mmap_lock). If it is
|
||||
disabled, the multi-gen LRU will suffer a minor performance
|
||||
degradation for workloads that contiguously map hot pages,
|
||||
whose accessed bits can be otherwise cleared by fewer larger
|
||||
batches.
|
||||
0x0004 Clearing the accessed bit in non-leaf page table entries as
|
||||
well, when MMU sets it (e.g., on x86). This behavior was not
|
||||
verified on x86 varieties other than Intel and AMD. If it is
|
||||
disabled, the multi-gen LRU will suffer a negligible
|
||||
performance degradation.
|
||||
[yYnN] Apply to all the components above.
|
||||
====== ===============================================================
|
||||
|
||||
E.g.,
|
||||
::
|
||||
|
||||
echo y >/sys/kernel/mm/lru_gen/enabled
|
||||
cat /sys/kernel/mm/lru_gen/enabled
|
||||
0x0007
|
||||
echo 5 >/sys/kernel/mm/lru_gen/enabled
|
||||
cat /sys/kernel/mm/lru_gen/enabled
|
||||
0x0005
|
||||
|
||||
Thrashing prevention
|
||||
--------------------
|
||||
Personal computers are more sensitive to thrashing because it can
|
||||
cause janks (lags when rendering UI) and negatively impact user
|
||||
experience. The multi-gen LRU offers thrashing prevention to the
|
||||
majority of laptop and desktop users who do not have ``oomd``.
|
||||
|
||||
Users can write ``N`` to ``min_ttl_ms`` to prevent the working set of
|
||||
``N`` milliseconds from getting evicted. The OOM killer is triggered
|
||||
if this working set cannot be kept in memory. In other words, this
|
||||
option works as an adjustable pressure relief valve, and when open, it
|
||||
terminates applications that are hopefully not being used.
|
||||
|
||||
Based on the average human detectable lag (~100ms), ``N=1000`` usually
|
||||
eliminates intolerable janks due to thrashing. Larger values like
|
||||
``N=3000`` make janks less noticeable at the risk of premature OOM
|
||||
kills.
|
||||
|
||||
The default value ``0`` means disabled.
|
||||
|
||||
Experimental features
|
||||
=====================
|
||||
``/sys/kernel/debug/lru_gen`` accepts commands described in the
|
||||
following subsections. Multiple command lines are supported, so does
|
||||
concatenation with delimiters ``,`` and ``;``.
|
||||
|
||||
``/sys/kernel/debug/lru_gen_full`` provides additional stats for
|
||||
debugging. ``CONFIG_LRU_GEN_STATS=y`` keeps historical stats from
|
||||
evicted generations in this file.
|
||||
|
||||
Working set estimation
|
||||
----------------------
|
||||
Working set estimation measures how much memory an application needs
|
||||
in a given time interval, and it is usually done with little impact on
|
||||
the performance of the application. E.g., data centers want to
|
||||
optimize job scheduling (bin packing) to improve memory utilizations.
|
||||
When a new job comes in, the job scheduler needs to find out whether
|
||||
each server it manages can allocate a certain amount of memory for
|
||||
this new job before it can pick a candidate. To do so, the job
|
||||
scheduler needs to estimate the working sets of the existing jobs.
|
||||
|
||||
When it is read, ``lru_gen`` returns a histogram of numbers of pages
|
||||
accessed over different time intervals for each memcg and node.
|
||||
``MAX_NR_GENS`` decides the number of bins for each histogram. The
|
||||
histograms are noncumulative.
|
||||
::
|
||||
|
||||
memcg memcg_id memcg_path
|
||||
node node_id
|
||||
min_gen_nr age_in_ms nr_anon_pages nr_file_pages
|
||||
...
|
||||
max_gen_nr age_in_ms nr_anon_pages nr_file_pages
|
||||
|
||||
Each bin contains an estimated number of pages that have been accessed
|
||||
within ``age_in_ms``. E.g., ``min_gen_nr`` contains the coldest pages
|
||||
and ``max_gen_nr`` contains the hottest pages, since ``age_in_ms`` of
|
||||
the former is the largest and that of the latter is the smallest.
|
||||
|
||||
Users can write the following command to ``lru_gen`` to create a new
|
||||
generation ``max_gen_nr+1``:
|
||||
|
||||
``+ memcg_id node_id max_gen_nr [can_swap [force_scan]]``
|
||||
|
||||
``can_swap`` defaults to the swap setting and, if it is set to ``1``,
|
||||
it forces the scan of anon pages when swap is off, and vice versa.
|
||||
``force_scan`` defaults to ``1`` and, if it is set to ``0``, it
|
||||
employs heuristics to reduce the overhead, which is likely to reduce
|
||||
the coverage as well.
|
||||
|
||||
A typical use case is that a job scheduler runs this command at a
|
||||
certain time interval to create new generations, and it ranks the
|
||||
servers it manages based on the sizes of their cold pages defined by
|
||||
this time interval.
|
||||
|
||||
Proactive reclaim
|
||||
-----------------
|
||||
Proactive reclaim induces page reclaim when there is no memory
|
||||
pressure. It usually targets cold pages only. E.g., when a new job
|
||||
comes in, the job scheduler wants to proactively reclaim cold pages on
|
||||
the server it selected, to improve the chance of successfully landing
|
||||
this new job.
|
||||
|
||||
Users can write the following command to ``lru_gen`` to evict
|
||||
generations less than or equal to ``min_gen_nr``.
|
||||
|
||||
``- memcg_id node_id min_gen_nr [swappiness [nr_to_reclaim]]``
|
||||
|
||||
``min_gen_nr`` should be less than ``max_gen_nr-1``, since
|
||||
``max_gen_nr`` and ``max_gen_nr-1`` are not fully aged (equivalent to
|
||||
the active list) and therefore cannot be evicted. ``swappiness``
|
||||
overrides the default value in ``/proc/sys/vm/swappiness``.
|
||||
``nr_to_reclaim`` limits the number of pages to evict.
|
||||
|
||||
A typical use case is that a job scheduler runs this command before it
|
||||
tries to land a new job on a server. If it fails to materialize enough
|
||||
cold pages because of the overestimation, it retries on the next
|
||||
server according to the ranking result obtained from the working set
|
||||
estimation step. This less forceful approach limits the impacts on the
|
||||
existing jobs.
|
||||
@@ -191,7 +191,14 @@ allocation failure to throttle the next allocation attempt::
|
||||
|
||||
/sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs
|
||||
|
||||
The khugepaged progress can be seen in the number of pages collapsed::
|
||||
The khugepaged progress can be seen in the number of pages collapsed (note
|
||||
that this counter may not be an exact count of the number of pages
|
||||
collapsed, since "collapsed" could mean multiple things: (1) A PTE mapping
|
||||
being replaced by a PMD mapping, or (2) All 4K physical pages replaced by
|
||||
one 2M hugepage. Each may happen independently, or together, depending on
|
||||
the type of memory and the failures that occur. As such, this value should
|
||||
be interpreted roughly as a sign of progress, and counters in /proc/vmstat
|
||||
consulted for more accurate accounting)::
|
||||
|
||||
/sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed
|
||||
|
||||
@@ -366,10 +373,9 @@ thp_split_pmd
|
||||
page table entry.
|
||||
|
||||
thp_zero_page_alloc
|
||||
is incremented every time a huge zero page is
|
||||
successfully allocated. It includes allocations which where
|
||||
dropped due race with other allocation. Note, it doesn't count
|
||||
every map of the huge zero page, only its allocation.
|
||||
is incremented every time a huge zero page used for thp is
|
||||
successfully allocated. Note, it doesn't count every map of
|
||||
the huge zero page, only its allocation.
|
||||
|
||||
thp_zero_page_alloc_failed
|
||||
is incremented if kernel fails to allocate
|
||||
|
||||
@@ -17,7 +17,10 @@ of the ``PROT_NONE+SIGSEGV`` trick.
|
||||
Design
|
||||
======
|
||||
|
||||
Userfaults are delivered and resolved through the ``userfaultfd`` syscall.
|
||||
Userspace creates a new userfaultfd, initializes it, and registers one or more
|
||||
regions of virtual memory with it. Then, any page faults which occur within the
|
||||
region(s) result in a message being delivered to the userfaultfd, notifying
|
||||
userspace of the fault.
|
||||
|
||||
The ``userfaultfd`` (aside from registering and unregistering virtual
|
||||
memory ranges) provides two primary functionalities:
|
||||
@@ -34,12 +37,11 @@ The real advantage of userfaults if compared to regular virtual memory
|
||||
management of mremap/mprotect is that the userfaults in all their
|
||||
operations never involve heavyweight structures like vmas (in fact the
|
||||
``userfaultfd`` runtime load never takes the mmap_lock for writing).
|
||||
|
||||
Vmas are not suitable for page- (or hugepage) granular fault tracking
|
||||
when dealing with virtual address spaces that could span
|
||||
Terabytes. Too many vmas would be needed for that.
|
||||
|
||||
The ``userfaultfd`` once opened by invoking the syscall, can also be
|
||||
The ``userfaultfd``, once created, can also be
|
||||
passed using unix domain sockets to a manager process, so the same
|
||||
manager process could handle the userfaults of a multitude of
|
||||
different processes without them being aware about what is going on
|
||||
@@ -50,6 +52,39 @@ is a corner case that would currently return ``-EBUSY``).
|
||||
API
|
||||
===
|
||||
|
||||
Creating a userfaultfd
|
||||
----------------------
|
||||
|
||||
There are two ways to create a new userfaultfd, each of which provide ways to
|
||||
restrict access to this functionality (since historically userfaultfds which
|
||||
handle kernel page faults have been a useful tool for exploiting the kernel).
|
||||
|
||||
The first way, supported since userfaultfd was introduced, is the
|
||||
userfaultfd(2) syscall. Access to this is controlled in several ways:
|
||||
|
||||
- Any user can always create a userfaultfd which traps userspace page faults
|
||||
only. Such a userfaultfd can be created using the userfaultfd(2) syscall
|
||||
with the flag UFFD_USER_MODE_ONLY.
|
||||
|
||||
- In order to also trap kernel page faults for the address space, either the
|
||||
process needs the CAP_SYS_PTRACE capability, or the system must have
|
||||
vm.unprivileged_userfaultfd set to 1. By default, vm.unprivileged_userfaultfd
|
||||
is set to 0.
|
||||
|
||||
The second way, added to the kernel more recently, is by opening
|
||||
/dev/userfaultfd and issuing a USERFAULTFD_IOC_NEW ioctl to it. This method
|
||||
yields equivalent userfaultfds to the userfaultfd(2) syscall.
|
||||
|
||||
Unlike userfaultfd(2), access to /dev/userfaultfd is controlled via normal
|
||||
filesystem permissions (user/group/mode), which gives fine grained access to
|
||||
userfaultfd specifically, without also granting other unrelated privileges at
|
||||
the same time (as e.g. granting CAP_SYS_PTRACE would do). Users who have access
|
||||
to /dev/userfaultfd can always create userfaultfds that trap kernel page faults;
|
||||
vm.unprivileged_userfaultfd is not considered.
|
||||
|
||||
Initializing a userfaultfd
|
||||
--------------------------
|
||||
|
||||
When first opened the ``userfaultfd`` must be enabled invoking the
|
||||
``UFFDIO_API`` ioctl specifying a ``uffdio_api.api`` value set to ``UFFD_API`` (or
|
||||
a later API version) which will specify the ``read/POLLIN`` protocol
|
||||
|
||||
100
Documentation/admin-guide/perf/alibaba_pmu.rst
Normal file
100
Documentation/admin-guide/perf/alibaba_pmu.rst
Normal file
@@ -0,0 +1,100 @@
|
||||
=============================================================
|
||||
Alibaba's T-Head SoC Uncore Performance Monitoring Unit (PMU)
|
||||
=============================================================
|
||||
|
||||
The Yitian 710, custom-built by Alibaba Group's chip development business,
|
||||
T-Head, implements uncore PMU for performance and functional debugging to
|
||||
facilitate system maintenance.
|
||||
|
||||
DDR Sub-System Driveway (DRW) PMU Driver
|
||||
=========================================
|
||||
|
||||
Yitian 710 employs eight DDR5/4 channels, four on each die. Each DDR5 channel
|
||||
is independent of others to service system memory requests. And one DDR5
|
||||
channel is split into two independent sub-channels. The DDR Sub-System Driveway
|
||||
implements separate PMUs for each sub-channel to monitor various performance
|
||||
metrics.
|
||||
|
||||
The Driveway PMU devices are named as ali_drw_<sys_base_addr> with perf.
|
||||
For example, ali_drw_21000 and ali_drw_21080 are two PMU devices for two
|
||||
sub-channels of the same channel in die 0. And the PMU device of die 1 is
|
||||
prefixed with ali_drw_400XXXXX, e.g. ali_drw_40021000.
|
||||
|
||||
Each sub-channel has 36 PMU counters in total, which is classified into
|
||||
four groups:
|
||||
|
||||
- Group 0: PMU Cycle Counter. This group has one pair of counters
|
||||
pmu_cycle_cnt_low and pmu_cycle_cnt_high, that is used as the cycle count
|
||||
based on DDRC core clock.
|
||||
|
||||
- Group 1: PMU Bandwidth Counters. This group has 8 counters that are used
|
||||
to count the total access number of either the eight bank groups in a
|
||||
selected rank, or four ranks separately in the first 4 counters. The base
|
||||
transfer unit is 64B.
|
||||
|
||||
- Group 2: PMU Retry Counters. This group has 10 counters, that intend to
|
||||
count the total retry number of each type of uncorrectable error.
|
||||
|
||||
- Group 3: PMU Common Counters. This group has 16 counters, that are used
|
||||
to count the common events.
|
||||
|
||||
For now, the Driveway PMU driver only uses counters in group 0 and group 3.
|
||||
|
||||
The DDR Controller (DDRCTL) and DDR PHY combine to create a complete solution
|
||||
for connecting an SoC application bus to DDR memory devices. The DDRCTL
|
||||
receives transactions Host Interface (HIF) which is custom-defined by Synopsys.
|
||||
These transactions are queued internally and scheduled for access while
|
||||
satisfying the SDRAM protocol timing requirements, transaction priorities, and
|
||||
dependencies between the transactions. The DDRCTL in turn issues commands on
|
||||
the DDR PHY Interface (DFI) to the PHY module, which launches and captures data
|
||||
to and from the SDRAM. The driveway PMUs have hardware logic to gather
|
||||
statistics and performance logging signals on HIF, DFI, etc.
|
||||
|
||||
By counting the READ, WRITE and RMW commands sent to the DDRC through the HIF
|
||||
interface, we could calculate the bandwidth. Example usage of counting memory
|
||||
data bandwidth::
|
||||
|
||||
perf stat \
|
||||
-e ali_drw_21000/hif_wr/ \
|
||||
-e ali_drw_21000/hif_rd/ \
|
||||
-e ali_drw_21000/hif_rmw/ \
|
||||
-e ali_drw_21000/cycle/ \
|
||||
-e ali_drw_21080/hif_wr/ \
|
||||
-e ali_drw_21080/hif_rd/ \
|
||||
-e ali_drw_21080/hif_rmw/ \
|
||||
-e ali_drw_21080/cycle/ \
|
||||
-e ali_drw_23000/hif_wr/ \
|
||||
-e ali_drw_23000/hif_rd/ \
|
||||
-e ali_drw_23000/hif_rmw/ \
|
||||
-e ali_drw_23000/cycle/ \
|
||||
-e ali_drw_23080/hif_wr/ \
|
||||
-e ali_drw_23080/hif_rd/ \
|
||||
-e ali_drw_23080/hif_rmw/ \
|
||||
-e ali_drw_23080/cycle/ \
|
||||
-e ali_drw_25000/hif_wr/ \
|
||||
-e ali_drw_25000/hif_rd/ \
|
||||
-e ali_drw_25000/hif_rmw/ \
|
||||
-e ali_drw_25000/cycle/ \
|
||||
-e ali_drw_25080/hif_wr/ \
|
||||
-e ali_drw_25080/hif_rd/ \
|
||||
-e ali_drw_25080/hif_rmw/ \
|
||||
-e ali_drw_25080/cycle/ \
|
||||
-e ali_drw_27000/hif_wr/ \
|
||||
-e ali_drw_27000/hif_rd/ \
|
||||
-e ali_drw_27000/hif_rmw/ \
|
||||
-e ali_drw_27000/cycle/ \
|
||||
-e ali_drw_27080/hif_wr/ \
|
||||
-e ali_drw_27080/hif_rd/ \
|
||||
-e ali_drw_27080/hif_rmw/ \
|
||||
-e ali_drw_27080/cycle/ -- sleep 10
|
||||
|
||||
The average DRAM bandwidth can be calculated as follows:
|
||||
|
||||
- Read Bandwidth = perf_hif_rd * DDRC_WIDTH * DDRC_Freq / DDRC_Cycle
|
||||
- Write Bandwidth = (perf_hif_wr + perf_hif_rmw) * DDRC_WIDTH * DDRC_Freq / DDRC_Cycle
|
||||
|
||||
Here, DDRC_WIDTH = 64 bytes.
|
||||
|
||||
The current driver does not support sampling. So "perf record" is
|
||||
unsupported. Also attach to a task is unsupported as the events are all
|
||||
uncore.
|
||||
@@ -18,3 +18,4 @@ Performance monitor support
|
||||
xgene-pmu
|
||||
arm_dsu_pmu
|
||||
thunderx2-pmu
|
||||
alibaba_pmu
|
||||
|
||||
@@ -182,6 +182,7 @@ to the ``struct sugov_cpu`` that the utilization update belongs to.
|
||||
Then, ``amd-pstate`` updates the desired performance according to the CPU
|
||||
scheduler assigned.
|
||||
|
||||
.. _processor_support:
|
||||
|
||||
Processor Support
|
||||
=======================
|
||||
@@ -282,6 +283,8 @@ efficiency frequency management method on AMD processors.
|
||||
Kernel Module Options for ``amd-pstate``
|
||||
=========================================
|
||||
|
||||
.. _shared_mem:
|
||||
|
||||
``shared_mem``
|
||||
Use a module param (shared_mem) to enable related processors manually with
|
||||
**amd_pstate.shared_mem=1**.
|
||||
@@ -393,6 +396,76 @@ about part of the output. ::
|
||||
CPU_005 712 116384 39 49 166 0.7565 9645075 2214891 38431470 25.1 11.646 469 2.496 kworker/5:0-40
|
||||
CPU_006 712 116408 39 49 166 0.6769 8950227 1839034 37192089 24.06 11.272 470 2.496 kworker/6:0-1264
|
||||
|
||||
Unit Tests for amd-pstate
|
||||
-------------------------
|
||||
|
||||
``amd-pstate-ut`` is a test module for testing the ``amd-pstate`` driver.
|
||||
|
||||
* It can help all users to verify their processor support (SBIOS/Firmware or Hardware).
|
||||
|
||||
* Kernel can have a basic function test to avoid the kernel regression during the update.
|
||||
|
||||
* We can introduce more functional or performance tests to align the result together, it will benefit power and performance scale optimization.
|
||||
|
||||
1. Test case decriptions
|
||||
|
||||
+---------+--------------------------------+------------------------------------------------------------------------------------+
|
||||
| Index | Functions | Description |
|
||||
+=========+================================+====================================================================================+
|
||||
| 0 | amd_pstate_ut_acpi_cpc_valid || Check whether the _CPC object is present in SBIOS. |
|
||||
| | || |
|
||||
| | || The detail refer to `Processor Support <processor_support_>`_. |
|
||||
+---------+--------------------------------+------------------------------------------------------------------------------------+
|
||||
| 1 | amd_pstate_ut_check_enabled || Check whether AMD P-State is enabled. |
|
||||
| | || |
|
||||
| | || AMD P-States and ACPI hardware P-States always can be supported in one processor. |
|
||||
| | | But AMD P-States has the higher priority and if it is enabled with |
|
||||
| | | :c:macro:`MSR_AMD_CPPC_ENABLE` or ``cppc_set_enable``, it will respond to the |
|
||||
| | | request from AMD P-States. |
|
||||
+---------+--------------------------------+------------------------------------------------------------------------------------+
|
||||
| 2 | amd_pstate_ut_check_perf || Check if the each performance values are reasonable. |
|
||||
| | || highest_perf >= nominal_perf > lowest_nonlinear_perf > lowest_perf > 0. |
|
||||
+---------+--------------------------------+------------------------------------------------------------------------------------+
|
||||
| 3 | amd_pstate_ut_check_freq || Check if the each frequency values and max freq when set support boost mode |
|
||||
| | | are reasonable. |
|
||||
| | || max_freq >= nominal_freq > lowest_nonlinear_freq > min_freq > 0 |
|
||||
| | || If boost is not active but supported, this maximum frequency will be larger than |
|
||||
| | | the one in ``cpuinfo``. |
|
||||
+---------+--------------------------------+------------------------------------------------------------------------------------+
|
||||
|
||||
#. How to execute the tests
|
||||
|
||||
We use test module in the kselftest frameworks to implement it.
|
||||
We create amd-pstate-ut module and tie it into kselftest.(for
|
||||
details refer to Linux Kernel Selftests [4]_).
|
||||
|
||||
1. Build
|
||||
|
||||
+ open the :c:macro:`CONFIG_X86_AMD_PSTATE` configuration option.
|
||||
+ set the :c:macro:`CONFIG_X86_AMD_PSTATE_UT` configuration option to M.
|
||||
+ make project
|
||||
+ make selftest ::
|
||||
|
||||
$ cd linux
|
||||
$ make -C tools/testing/selftests
|
||||
|
||||
#. Installation & Steps ::
|
||||
|
||||
$ make -C tools/testing/selftests install INSTALL_PATH=~/kselftest
|
||||
$ sudo ./kselftest/run_kselftest.sh -c amd-pstate
|
||||
TAP version 13
|
||||
1..1
|
||||
# selftests: amd-pstate: amd-pstate-ut.sh
|
||||
# amd-pstate-ut: ok
|
||||
ok 1 selftests: amd-pstate: amd-pstate-ut.sh
|
||||
|
||||
#. Results ::
|
||||
|
||||
$ dmesg | grep "amd_pstate_ut" | tee log.txt
|
||||
[12977.570663] amd_pstate_ut: 1 amd_pstate_ut_acpi_cpc_valid success!
|
||||
[12977.570673] amd_pstate_ut: 2 amd_pstate_ut_check_enabled success!
|
||||
[12977.571207] amd_pstate_ut: 3 amd_pstate_ut_check_perf success!
|
||||
[12977.571212] amd_pstate_ut: 4 amd_pstate_ut_check_freq success!
|
||||
|
||||
Reference
|
||||
===========
|
||||
@@ -405,3 +478,6 @@ Reference
|
||||
|
||||
.. [3] Processor Programming Reference (PPR) for AMD Family 19h Model 51h, Revision A1 Processors
|
||||
https://www.amd.com/system/files/TechDocs/56569-A1-PUB.zip
|
||||
|
||||
.. [4] Linux Kernel Selftests,
|
||||
https://www.kernel.org/doc/html/latest/dev-tools/kselftest.html
|
||||
|
||||
@@ -65,6 +65,11 @@ combining the following values:
|
||||
4 s3_beep
|
||||
= =======
|
||||
|
||||
arch
|
||||
====
|
||||
|
||||
The machine hardware name, the same output as ``uname -m``
|
||||
(e.g. ``x86_64`` or ``aarch64``).
|
||||
|
||||
auto_msgmni
|
||||
===========
|
||||
@@ -635,6 +640,17 @@ different types of memory (represented as different NUMA nodes) to
|
||||
place the hot pages in the fast memory. This is implemented based on
|
||||
unmapping and page fault too.
|
||||
|
||||
numa_balancing_promote_rate_limit_MBps
|
||||
======================================
|
||||
|
||||
Too high promotion/demotion throughput between different memory types
|
||||
may hurt application latency. This can be used to rate limit the
|
||||
promotion throughput. The per-node max promotion throughput in MB/s
|
||||
will be limited to be no more than the set value.
|
||||
|
||||
A rule of thumb is to set this to less than 1/10 of the PMEM node
|
||||
write bandwidth.
|
||||
|
||||
oops_all_cpu_backtrace
|
||||
======================
|
||||
|
||||
|
||||
@@ -31,17 +31,18 @@ see only some of them, depending on your kernel's configuration.
|
||||
|
||||
Table : Subdirectories in /proc/sys/net
|
||||
|
||||
========= =================== = ========== ==================
|
||||
========= =================== = ========== ===================
|
||||
Directory Content Directory Content
|
||||
========= =================== = ========== ==================
|
||||
core General parameter appletalk Appletalk protocol
|
||||
unix Unix domain sockets netrom NET/ROM
|
||||
802 E802 protocol ax25 AX25
|
||||
ethernet Ethernet protocol rose X.25 PLP layer
|
||||
========= =================== = ========== ===================
|
||||
802 E802 protocol mptcp Multipath TCP
|
||||
appletalk Appletalk protocol netfilter Network Filter
|
||||
ax25 AX25 netrom NET/ROM
|
||||
bridge Bridging rose X.25 PLP layer
|
||||
core General parameter tipc TIPC
|
||||
ethernet Ethernet protocol unix Unix domain sockets
|
||||
ipv4 IP version 4 x25 X.25 protocol
|
||||
bridge Bridging decnet DEC net
|
||||
ipv6 IP version 6 tipc TIPC
|
||||
========= =================== = ========== ==================
|
||||
ipv6 IP version 6
|
||||
========= =================== = ========== ===================
|
||||
|
||||
1. /proc/sys/net/core - Network core options
|
||||
============================================
|
||||
@@ -101,6 +102,9 @@ Values:
|
||||
- 1 - enable JIT hardening for unprivileged users only
|
||||
- 2 - enable JIT hardening for all users
|
||||
|
||||
where "privileged user" in this context means a process having
|
||||
CAP_BPF or CAP_SYS_ADMIN in the root user name space.
|
||||
|
||||
bpf_jit_kallsyms
|
||||
----------------
|
||||
|
||||
@@ -271,7 +275,7 @@ poll cycle or the number of packets processed reaches netdev_budget.
|
||||
netdev_max_backlog
|
||||
------------------
|
||||
|
||||
Maximum number of packets, queued on the INPUT side, when the interface
|
||||
Maximum number of packets, queued on the INPUT side, when the interface
|
||||
receives packets faster than kernel can process them.
|
||||
|
||||
netdev_rss_key
|
||||
|
||||
@@ -926,6 +926,9 @@ calls without any restrictions.
|
||||
|
||||
The default value is 0.
|
||||
|
||||
Another way to control permissions for userfaultfd is to use
|
||||
/dev/userfaultfd instead of userfaultfd(2). See
|
||||
Documentation/admin-guide/mm/userfaultfd.rst.
|
||||
|
||||
user_reserve_kbytes
|
||||
===================
|
||||
|
||||
@@ -134,6 +134,12 @@ More detailed explanation for tainting
|
||||
scsi/snic on something else than x86_64, scsi/ips on non
|
||||
x86/x86_64/itanium, have broken firmware settings for the
|
||||
irqchip/irq-gic on arm64 ...).
|
||||
- x86/x86_64: Microcode late loading is dangerous and will result in
|
||||
tainting the kernel. It requires that all CPUs rendezvous to make sure
|
||||
the update happens when the system is as quiescent as possible. However,
|
||||
a higher priority MCE/SMI/NMI can move control flow away from that
|
||||
rendezvous and interrupt the update, which can be detrimental to the
|
||||
machine.
|
||||
|
||||
3) ``R`` if a module was force unloaded by ``rmmod -f``, ``' '`` if all
|
||||
modules were unloaded normally.
|
||||
|
||||
@@ -59,6 +59,7 @@ SoC-specific documents
|
||||
stm32/stm32f429-overview
|
||||
stm32/stm32mp13-overview
|
||||
stm32/stm32mp157-overview
|
||||
stm32/stm32-dma-mdma-chaining
|
||||
|
||||
sunxi
|
||||
|
||||
|
||||
415
Documentation/arm/stm32/stm32-dma-mdma-chaining.rst
Normal file
415
Documentation/arm/stm32/stm32-dma-mdma-chaining.rst
Normal file
@@ -0,0 +1,415 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=======================
|
||||
STM32 DMA-MDMA chaining
|
||||
=======================
|
||||
|
||||
|
||||
Introduction
|
||||
------------
|
||||
|
||||
This document describes the STM32 DMA-MDMA chaining feature. But before going
|
||||
further, let's introduce the peripherals involved.
|
||||
|
||||
To offload data transfers from the CPU, STM32 microprocessors (MPUs) embed
|
||||
direct memory access controllers (DMA).
|
||||
|
||||
STM32MP1 SoCs embed both STM32 DMA and STM32 MDMA controllers. STM32 DMA
|
||||
request routing capabilities are enhanced by a DMA request multiplexer
|
||||
(STM32 DMAMUX).
|
||||
|
||||
**STM32 DMAMUX**
|
||||
|
||||
STM32 DMAMUX routes any DMA request from a given peripheral to any STM32 DMA
|
||||
controller (STM32MP1 counts two STM32 DMA controllers) channels.
|
||||
|
||||
**STM32 DMA**
|
||||
|
||||
STM32 DMA is mainly used to implement central data buffer storage (usually in
|
||||
the system SRAM) for different peripheral. It can access external RAMs but
|
||||
without the ability to generate convenient burst transfer ensuring the best
|
||||
load of the AXI.
|
||||
|
||||
**STM32 MDMA**
|
||||
|
||||
STM32 MDMA (Master DMA) is mainly used to manage direct data transfers between
|
||||
RAM data buffers without CPU intervention. It can also be used in a
|
||||
hierarchical structure that uses STM32 DMA as first level data buffer
|
||||
interfaces for AHB peripherals, while the STM32 MDMA acts as a second level
|
||||
DMA with better performance. As a AXI/AHB master, STM32 MDMA can take control
|
||||
of the AXI/AHB bus.
|
||||
|
||||
|
||||
Principles
|
||||
----------
|
||||
|
||||
STM32 DMA-MDMA chaining feature relies on the strengths of STM32 DMA and
|
||||
STM32 MDMA controllers.
|
||||
|
||||
STM32 DMA has a circular Double Buffer Mode (DBM). At each end of transaction
|
||||
(when DMA data counter - DMA_SxNDTR - reaches 0), the memory pointers
|
||||
(configured with DMA_SxSM0AR and DMA_SxM1AR) are swapped and the DMA data
|
||||
counter is automatically reloaded. This allows the SW or the STM32 MDMA to
|
||||
process one memory area while the second memory area is being filled/used by
|
||||
the STM32 DMA transfer.
|
||||
|
||||
With STM32 MDMA linked-list mode, a single request initiates the data array
|
||||
(collection of nodes) to be transferred until the linked-list pointer for the
|
||||
channel is null. The channel transfer complete of the last node is the end of
|
||||
transfer, unless first and last nodes are linked to each other, in such a
|
||||
case, the linked-list loops on to create a circular MDMA transfer.
|
||||
|
||||
STM32 MDMA has direct connections with STM32 DMA. This enables autonomous
|
||||
communication and synchronization between peripherals, thus saving CPU
|
||||
resources and bus congestion. Transfer Complete signal of STM32 DMA channel
|
||||
can triggers STM32 MDMA transfer. STM32 MDMA can clear the request generated
|
||||
by the STM32 DMA by writing to its Interrupt Clear register (whose address is
|
||||
stored in MDMA_CxMAR, and bit mask in MDMA_CxMDR).
|
||||
|
||||
.. table:: STM32 MDMA interconnect table with STM32 DMA
|
||||
|
||||
+--------------+----------------+-----------+------------+
|
||||
| STM32 DMAMUX | STM32 DMA | STM32 DMA | STM32 MDMA |
|
||||
| channels | channels | Transfer | request |
|
||||
| | | complete | |
|
||||
| | | signal | |
|
||||
+==============+================+===========+============+
|
||||
| Channel *0* | DMA1 channel 0 | dma1_tcf0 | *0x00* |
|
||||
+--------------+----------------+-----------+------------+
|
||||
| Channel *1* | DMA1 channel 1 | dma1_tcf1 | *0x01* |
|
||||
+--------------+----------------+-----------+------------+
|
||||
| Channel *2* | DMA1 channel 2 | dma1_tcf2 | *0x02* |
|
||||
+--------------+----------------+-----------+------------+
|
||||
| Channel *3* | DMA1 channel 3 | dma1_tcf3 | *0x03* |
|
||||
+--------------+----------------+-----------+------------+
|
||||
| Channel *4* | DMA1 channel 4 | dma1_tcf4 | *0x04* |
|
||||
+--------------+----------------+-----------+------------+
|
||||
| Channel *5* | DMA1 channel 5 | dma1_tcf5 | *0x05* |
|
||||
+--------------+----------------+-----------+------------+
|
||||
| Channel *6* | DMA1 channel 6 | dma1_tcf6 | *0x06* |
|
||||
+--------------+----------------+-----------+------------+
|
||||
| Channel *7* | DMA1 channel 7 | dma1_tcf7 | *0x07* |
|
||||
+--------------+----------------+-----------+------------+
|
||||
| Channel *8* | DMA2 channel 0 | dma2_tcf0 | *0x08* |
|
||||
+--------------+----------------+-----------+------------+
|
||||
| Channel *9* | DMA2 channel 1 | dma2_tcf1 | *0x09* |
|
||||
+--------------+----------------+-----------+------------+
|
||||
| Channel *10* | DMA2 channel 2 | dma2_tcf2 | *0x0A* |
|
||||
+--------------+----------------+-----------+------------+
|
||||
| Channel *11* | DMA2 channel 3 | dma2_tcf3 | *0x0B* |
|
||||
+--------------+----------------+-----------+------------+
|
||||
| Channel *12* | DMA2 channel 4 | dma2_tcf4 | *0x0C* |
|
||||
+--------------+----------------+-----------+------------+
|
||||
| Channel *13* | DMA2 channel 5 | dma2_tcf5 | *0x0D* |
|
||||
+--------------+----------------+-----------+------------+
|
||||
| Channel *14* | DMA2 channel 6 | dma2_tcf6 | *0x0E* |
|
||||
+--------------+----------------+-----------+------------+
|
||||
| Channel *15* | DMA2 channel 7 | dma2_tcf7 | *0x0F* |
|
||||
+--------------+----------------+-----------+------------+
|
||||
|
||||
STM32 DMA-MDMA chaining feature then uses a SRAM buffer. STM32MP1 SoCs embed
|
||||
three fast access static internal RAMs of various size, used for data storage.
|
||||
Due to STM32 DMA legacy (within microcontrollers), STM32 DMA performances are
|
||||
bad with DDR, while they are optimal with SRAM. Hence the SRAM buffer used
|
||||
between STM32 DMA and STM32 MDMA. This buffer is split in two equal periods
|
||||
and STM32 DMA uses one period while STM32 MDMA uses the other period
|
||||
simultaneously.
|
||||
::
|
||||
|
||||
dma[1:2]-tcf[0:7]
|
||||
.----------------.
|
||||
____________ ' _________ V____________
|
||||
| STM32 DMA | / __|>_ \ | STM32 MDMA |
|
||||
|------------| | / \ | |------------|
|
||||
| DMA_SxM0AR |<=>| | SRAM | |<=>| []-[]...[] |
|
||||
| DMA_SxM1AR | | \_____/ | | |
|
||||
|____________| \___<|____/ |____________|
|
||||
|
||||
STM32 DMA-MDMA chaining uses (struct dma_slave_config).peripheral_config to
|
||||
exchange the parameters needed to configure MDMA. These parameters are
|
||||
gathered into a u32 array with three values:
|
||||
|
||||
* the STM32 MDMA request (which is actually the DMAMUX channel ID),
|
||||
* the address of the STM32 DMA register to clear the Transfer Complete
|
||||
interrupt flag,
|
||||
* the mask of the Transfer Complete interrupt flag of the STM32 DMA channel.
|
||||
|
||||
Device Tree updates for STM32 DMA-MDMA chaining support
|
||||
-------------------------------------------------------
|
||||
|
||||
**1. Allocate a SRAM buffer**
|
||||
|
||||
SRAM device tree node is defined in SoC device tree. You can refer to it in
|
||||
your board device tree to define your SRAM pool.
|
||||
::
|
||||
|
||||
&sram {
|
||||
my_foo_device_dma_pool: dma-sram@0 {
|
||||
reg = <0x0 0x1000>;
|
||||
};
|
||||
};
|
||||
|
||||
Be careful of the start index, in case there are other SRAM consumers.
|
||||
Define your pool size strategically: to optimise chaining, the idea is that
|
||||
STM32 DMA and STM32 MDMA can work simultaneously, on each buffer of the
|
||||
SRAM.
|
||||
If the SRAM period is greater than the expected DMA transfer, then STM32 DMA
|
||||
and STM32 MDMA will work sequentially instead of simultaneously. It is not a
|
||||
functional issue but it is not optimal.
|
||||
|
||||
Don't forget to refer to your SRAM pool in your device node. You need to
|
||||
define a new property.
|
||||
::
|
||||
|
||||
&my_foo_device {
|
||||
...
|
||||
my_dma_pool = &my_foo_device_dma_pool;
|
||||
};
|
||||
|
||||
Then get this SRAM pool in your foo driver and allocate your SRAM buffer.
|
||||
|
||||
**2. Allocate a STM32 DMA channel and a STM32 MDMA channel**
|
||||
|
||||
You need to define an extra channel in your device tree node, in addition to
|
||||
the one you should already have for "classic" DMA operation.
|
||||
|
||||
This new channel must be taken from STM32 MDMA channels, so, the phandle of
|
||||
the DMA controller to use is the MDMA controller's one.
|
||||
::
|
||||
|
||||
&my_foo_device {
|
||||
[...]
|
||||
my_dma_pool = &my_foo_device_dma_pool;
|
||||
dmas = <&dmamux1 ...>, // STM32 DMA channel
|
||||
<&mdma1 0 0x3 0x1200000a 0 0>; // + STM32 MDMA channel
|
||||
};
|
||||
|
||||
Concerning STM32 MDMA bindings:
|
||||
|
||||
1. The request line number : whatever the value here, it will be overwritten
|
||||
by MDMA driver with the STM32 DMAMUX channel ID passed through
|
||||
(struct dma_slave_config).peripheral_config
|
||||
|
||||
2. The priority level : choose Very High (0x3) so that your channel will
|
||||
take priority other the other during request arbitration
|
||||
|
||||
3. A 32bit mask specifying the DMA channel configuration : source and
|
||||
destination address increment, block transfer with 128 bytes per single
|
||||
transfer
|
||||
|
||||
4. The 32bit value specifying the register to be used to acknowledge the
|
||||
request: it will be overwritten by MDMA driver, with the DMA channel
|
||||
interrupt flag clear register address passed through
|
||||
(struct dma_slave_config).peripheral_config
|
||||
|
||||
5. The 32bit mask specifying the value to be written to acknowledge the
|
||||
request: it will be overwritten by MDMA driver, with the DMA channel
|
||||
Transfer Complete flag passed through
|
||||
(struct dma_slave_config).peripheral_config
|
||||
|
||||
Driver updates for STM32 DMA-MDMA chaining support in foo driver
|
||||
----------------------------------------------------------------
|
||||
|
||||
**0. (optional) Refactor the original sg_table if dmaengine_prep_slave_sg()**
|
||||
|
||||
In case of dmaengine_prep_slave_sg(), the original sg_table can't be used as
|
||||
is. Two new sg_tables must be created from the original one. One for
|
||||
STM32 DMA transfer (where memory address targets now the SRAM buffer instead
|
||||
of DDR buffer) and one for STM32 MDMA transfer (where memory address targets
|
||||
the DDR buffer).
|
||||
|
||||
The new sg_list items must fit SRAM period length. Here is an example for
|
||||
DMA_DEV_TO_MEM:
|
||||
::
|
||||
|
||||
/*
|
||||
* Assuming sgl and nents, respectively the initial scatterlist and its
|
||||
* length.
|
||||
* Assuming sram_dma_buf and sram_period, respectively the memory
|
||||
* allocated from the pool for DMA usage, and the length of the period,
|
||||
* which is half of the sram_buf size.
|
||||
*/
|
||||
struct sg_table new_dma_sgt, new_mdma_sgt;
|
||||
struct scatterlist *s, *_sgl;
|
||||
dma_addr_t ddr_dma_buf;
|
||||
u32 new_nents = 0, len;
|
||||
int i;
|
||||
|
||||
/* Count the number of entries needed */
|
||||
for_each_sg(sgl, s, nents, i)
|
||||
if (sg_dma_len(s) > sram_period)
|
||||
new_nents += DIV_ROUND_UP(sg_dma_len(s), sram_period);
|
||||
else
|
||||
new_nents++;
|
||||
|
||||
/* Create sg table for STM32 DMA channel */
|
||||
ret = sg_alloc_table(&new_dma_sgt, new_nents, GFP_ATOMIC);
|
||||
if (ret)
|
||||
dev_err(dev, "DMA sg table alloc failed\n");
|
||||
|
||||
for_each_sg(new_dma_sgt.sgl, s, new_dma_sgt.nents, i) {
|
||||
_sgl = sgl;
|
||||
sg_dma_len(s) = min(sg_dma_len(_sgl), sram_period);
|
||||
/* Targets the beginning = first half of the sram_buf */
|
||||
s->dma_address = sram_buf;
|
||||
/*
|
||||
* Targets the second half of the sram_buf
|
||||
* for odd indexes of the item of the sg_list
|
||||
*/
|
||||
if (i & 1)
|
||||
s->dma_address += sram_period;
|
||||
}
|
||||
|
||||
/* Create sg table for STM32 MDMA channel */
|
||||
ret = sg_alloc_table(&new_mdma_sgt, new_nents, GFP_ATOMIC);
|
||||
if (ret)
|
||||
dev_err(dev, "MDMA sg_table alloc failed\n");
|
||||
|
||||
_sgl = sgl;
|
||||
len = sg_dma_len(sgl);
|
||||
ddr_dma_buf = sg_dma_address(sgl);
|
||||
for_each_sg(mdma_sgt.sgl, s, mdma_sgt.nents, i) {
|
||||
size_t bytes = min_t(size_t, len, sram_period);
|
||||
|
||||
sg_dma_len(s) = bytes;
|
||||
sg_dma_address(s) = ddr_dma_buf;
|
||||
len -= bytes;
|
||||
|
||||
if (!len && sg_next(_sgl)) {
|
||||
_sgl = sg_next(_sgl);
|
||||
len = sg_dma_len(_sgl);
|
||||
ddr_dma_buf = sg_dma_address(_sgl);
|
||||
} else {
|
||||
ddr_dma_buf += bytes;
|
||||
}
|
||||
}
|
||||
|
||||
Don't forget to release these new sg_tables after getting the descriptors
|
||||
with dmaengine_prep_slave_sg().
|
||||
|
||||
**1. Set controller specific parameters**
|
||||
|
||||
First, use dmaengine_slave_config() with a struct dma_slave_config to
|
||||
configure STM32 DMA channel. You just have to take care of DMA addresses,
|
||||
the memory address (depending on the transfer direction) must point on your
|
||||
SRAM buffer, and set (struct dma_slave_config).peripheral_size != 0.
|
||||
|
||||
STM32 DMA driver will check (struct dma_slave_config).peripheral_size to
|
||||
determine if chaining is being used or not. If it is used, then STM32 DMA
|
||||
driver fills (struct dma_slave_config).peripheral_config with an array of
|
||||
three u32 : the first one containing STM32 DMAMUX channel ID, the second one
|
||||
the channel interrupt flag clear register address, and the third one the
|
||||
channel Transfer Complete flag mask.
|
||||
|
||||
Then, use dmaengine_slave_config with another struct dma_slave_config to
|
||||
configure STM32 MDMA channel. Take care of DMA addresses, the device address
|
||||
(depending on the transfer direction) must point on your SRAM buffer, and
|
||||
the memory address must point to the buffer originally used for "classic"
|
||||
DMA operation. Use the previous (struct dma_slave_config).peripheral_size
|
||||
and .peripheral_config that have been updated by STM32 DMA driver, to set
|
||||
(struct dma_slave_config).peripheral_size and .peripheral_config of the
|
||||
struct dma_slave_config to configure STM32 MDMA channel.
|
||||
::
|
||||
|
||||
struct dma_slave_config dma_conf;
|
||||
struct dma_slave_config mdma_conf;
|
||||
|
||||
memset(&dma_conf, 0, sizeof(dma_conf));
|
||||
[...]
|
||||
config.direction = DMA_DEV_TO_MEM;
|
||||
config.dst_addr = sram_dma_buf; // SRAM buffer
|
||||
config.peripheral_size = 1; // peripheral_size != 0 => chaining
|
||||
|
||||
dmaengine_slave_config(dma_chan, &dma_config);
|
||||
|
||||
memset(&mdma_conf, 0, sizeof(mdma_conf));
|
||||
config.direction = DMA_DEV_TO_MEM;
|
||||
mdma_conf.src_addr = sram_dma_buf; // SRAM buffer
|
||||
mdma_conf.dst_addr = rx_dma_buf; // original memory buffer
|
||||
mdma_conf.peripheral_size = dma_conf.peripheral_size; // <- dma_conf
|
||||
mdma_conf.peripheral_config = dma_config.peripheral_config; // <- dma_conf
|
||||
|
||||
dmaengine_slave_config(mdma_chan, &mdma_conf);
|
||||
|
||||
**2. Get a descriptor for STM32 DMA channel transaction**
|
||||
|
||||
In the same way you get your descriptor for your "classic" DMA operation,
|
||||
you just have to replace the original sg_list (in case of
|
||||
dmaengine_prep_slave_sg()) with the new sg_list using SRAM buffer, or to
|
||||
replace the original buffer address, length and period (in case of
|
||||
dmaengine_prep_dma_cyclic()) with the new SRAM buffer.
|
||||
|
||||
**3. Get a descriptor for STM32 MDMA channel transaction**
|
||||
|
||||
If you previously get descriptor (for STM32 DMA) with
|
||||
|
||||
* dmaengine_prep_slave_sg(), then use dmaengine_prep_slave_sg() for
|
||||
STM32 MDMA;
|
||||
* dmaengine_prep_dma_cyclic(), then use dmaengine_prep_dma_cyclic() for
|
||||
STM32 MDMA.
|
||||
|
||||
Use the new sg_list using SRAM buffer (in case of dmaengine_prep_slave_sg())
|
||||
or, depending on the transfer direction, either the original DDR buffer (in
|
||||
case of DMA_DEV_TO_MEM) or the SRAM buffer (in case of DMA_MEM_TO_DEV), the
|
||||
source address being previously set with dmaengine_slave_config().
|
||||
|
||||
**4. Submit both transactions**
|
||||
|
||||
Before submitting your transactions, you may need to define on which
|
||||
descriptor you want a callback to be called at the end of the transfer
|
||||
(dmaengine_prep_slave_sg()) or the period (dmaengine_prep_dma_cyclic()).
|
||||
Depending on the direction, set the callback on the descriptor that finishes
|
||||
the overal transfer:
|
||||
|
||||
* DMA_DEV_TO_MEM: set the callback on the "MDMA" descriptor
|
||||
* DMA_MEM_TO_DEV: set the callback on the "DMA" descriptor
|
||||
|
||||
Then, submit the descriptors whatever the order, with dmaengine_tx_submit().
|
||||
|
||||
**5. Issue pending requests (and wait for callback notification)**
|
||||
|
||||
As STM32 MDMA channel transfer is triggered by STM32 DMA, you must issue
|
||||
STM32 MDMA channel before STM32 DMA channel.
|
||||
|
||||
If any, your callback will be called to warn you about the end of the overal
|
||||
transfer or the period completion.
|
||||
|
||||
Don't forget to terminate both channels. STM32 DMA channel is configured in
|
||||
cyclic Double-Buffer mode so it won't be disabled by HW, you need to terminate
|
||||
it. STM32 MDMA channel will be stopped by HW in case of sg transfer, but not
|
||||
in case of cyclic transfer. You can terminate it whatever the kind of transfer.
|
||||
|
||||
**STM32 DMA-MDMA chaining DMA_MEM_TO_DEV special case**
|
||||
|
||||
STM32 DMA-MDMA chaining in DMA_MEM_TO_DEV is a special case. Indeed, the
|
||||
STM32 MDMA feeds the SRAM buffer with the DDR data, and the STM32 DMA reads
|
||||
data from SRAM buffer. So some data (the first period) have to be copied in
|
||||
SRAM buffer when the STM32 DMA starts to read.
|
||||
|
||||
A trick could be pausing the STM32 DMA channel (that will raise a Transfer
|
||||
Complete signal, triggering the STM32 MDMA channel), but the first data read
|
||||
by the STM32 DMA could be "wrong". The proper way is to prepare the first SRAM
|
||||
period with dmaengine_prep_dma_memcpy(). Then this first period should be
|
||||
"removed" from the sg or the cyclic transfer.
|
||||
|
||||
Due to this complexity, rather use the STM32 DMA-MDMA chaining for
|
||||
DMA_DEV_TO_MEM and keep the "classic" DMA usage for DMA_MEM_TO_DEV, unless
|
||||
you're not afraid.
|
||||
|
||||
Resources
|
||||
---------
|
||||
|
||||
Application note, datasheet and reference manual are available on ST website
|
||||
(STM32MP1_).
|
||||
|
||||
Dedicated focus on three application notes (AN5224_, AN4031_ & AN5001_)
|
||||
dealing with STM32 DMAMUX, STM32 DMA and STM32 MDMA.
|
||||
|
||||
.. _STM32MP1: https://www.st.com/en/microcontrollers-microprocessors/stm32mp1-series.html
|
||||
.. _AN5224: https://www.st.com/resource/en/application_note/an5224-stm32-dmamux-the-dma-request-router-stmicroelectronics.pdf
|
||||
.. _AN4031: https://www.st.com/resource/en/application_note/dm00046011-using-the-stm32f2-stm32f4-and-stm32f7-series-dma-controller-stmicroelectronics.pdf
|
||||
.. _AN5001: https://www.st.com/resource/en/application_note/an5001-stm32cube-expansion-package-for-stm32h7-series-mdma-stmicroelectronics.pdf
|
||||
|
||||
:Authors:
|
||||
|
||||
- Amelie Delaunay <amelie.delaunay@foss.st.com>
|
||||
@@ -65,10 +65,6 @@ linux,uefi-mmap-desc-size 32-bit Size in bytes of each entry in the UEFI
|
||||
|
||||
linux,uefi-mmap-desc-ver 32-bit Version of the mmap descriptor format.
|
||||
|
||||
linux,initrd-start 64-bit Physical start address of an initrd
|
||||
|
||||
linux,initrd-end 64-bit Physical end address of an initrd
|
||||
|
||||
kaslr-seed 64-bit Entropy used to randomize the kernel image
|
||||
base address location.
|
||||
========================== ====== ===========================================
|
||||
|
||||
@@ -242,46 +242,39 @@ HWCAP2_MTE3
|
||||
by Documentation/arm64/memory-tagging-extension.rst.
|
||||
|
||||
HWCAP2_SME
|
||||
|
||||
Functionality implied by ID_AA64PFR1_EL1.SME == 0b0001, as described
|
||||
by Documentation/arm64/sme.rst.
|
||||
|
||||
HWCAP2_SME_I16I64
|
||||
|
||||
Functionality implied by ID_AA64SMFR0_EL1.I16I64 == 0b1111.
|
||||
|
||||
HWCAP2_SME_F64F64
|
||||
|
||||
Functionality implied by ID_AA64SMFR0_EL1.F64F64 == 0b1.
|
||||
|
||||
HWCAP2_SME_I8I32
|
||||
|
||||
Functionality implied by ID_AA64SMFR0_EL1.I8I32 == 0b1111.
|
||||
|
||||
HWCAP2_SME_F16F32
|
||||
|
||||
Functionality implied by ID_AA64SMFR0_EL1.F16F32 == 0b1.
|
||||
|
||||
HWCAP2_SME_B16F32
|
||||
|
||||
Functionality implied by ID_AA64SMFR0_EL1.B16F32 == 0b1.
|
||||
|
||||
HWCAP2_SME_F32F32
|
||||
|
||||
Functionality implied by ID_AA64SMFR0_EL1.F32F32 == 0b1.
|
||||
|
||||
HWCAP2_SME_FA64
|
||||
|
||||
Functionality implied by ID_AA64SMFR0_EL1.FA64 == 0b1.
|
||||
|
||||
HWCAP2_WFXT
|
||||
|
||||
Functionality implied by ID_AA64ISAR2_EL1.WFXT == 0b0010.
|
||||
|
||||
HWCAP2_EBF16
|
||||
|
||||
Functionality implied by ID_AA64ISAR1_EL1.BF16 == 0b0010.
|
||||
|
||||
HWCAP2_SVE_EBF16
|
||||
Functionality implied by ID_AA64ZFR0_EL1.BF16 == 0b0010.
|
||||
|
||||
4. Unused AT_HWCAP bits
|
||||
-----------------------
|
||||
|
||||
|
||||
@@ -52,6 +52,8 @@ stable kernels.
|
||||
| Allwinner | A64/R18 | UNKNOWN1 | SUN50I_ERRATUM_UNKNOWN1 |
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| ARM | Cortex-A510 | #2457168 | ARM64_ERRATUM_2457168 |
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| ARM | Cortex-A510 | #2064142 | ARM64_ERRATUM_2064142 |
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| ARM | Cortex-A510 | #2038923 | ARM64_ERRATUM_2038923 |
|
||||
@@ -74,6 +76,8 @@ stable kernels.
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| ARM | Cortex-A55 | #1530923 | ARM64_ERRATUM_1530923 |
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| ARM | Cortex-A55 | #2441007 | ARM64_ERRATUM_2441007 |
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| ARM | Cortex-A57 | #832075 | ARM64_ERRATUM_832075 |
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| ARM | Cortex-A57 | #852523 | N/A |
|
||||
@@ -108,6 +112,8 @@ stable kernels.
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| ARM | Cortex-A510 | #2441009 | ARM64_ERRATUM_2441009 |
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| ARM | Cortex-A510 | #2658417 | ARM64_ERRATUM_2658417 |
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| ARM | Cortex-A710 | #2119858 | ARM64_ERRATUM_2119858 |
|
||||
+----------------+-----------------+-----------------+-----------------------------+
|
||||
| ARM | Cortex-A710 | #2054223 | ARM64_ERRATUM_2054223 |
|
||||
|
||||
@@ -331,6 +331,9 @@ The regset data starts with struct user_za_header, containing:
|
||||
been read if a PTRACE_GETREGSET of NT_ARM_ZA were executed for each thread
|
||||
when the coredump was generated.
|
||||
|
||||
* The NT_ARM_TLS note will be extended to two registers, the second register
|
||||
will contain TPIDR2_EL0 on systems that support SME and will be read as
|
||||
zero with writes ignored otherwise.
|
||||
|
||||
9. System runtime configuration
|
||||
--------------------------------
|
||||
|
||||
@@ -111,7 +111,7 @@ the SVE instruction set architecture.
|
||||
|
||||
* On syscall, V0..V31 are preserved (as without SVE). Thus, bits [127:0] of
|
||||
Z0..Z31 are preserved. All other bits of Z0..Z31, and all of P0..P15 and FFR
|
||||
become unspecified on return from a syscall.
|
||||
become zero on return from a syscall.
|
||||
|
||||
* The SVE registers are not used to pass arguments to or receive results from
|
||||
any syscall.
|
||||
@@ -452,6 +452,24 @@ The regset data starts with struct user_sve_header, containing:
|
||||
* Modifying the system default vector length does not affect the vector length
|
||||
of any existing process or thread that does not make an execve() call.
|
||||
|
||||
10. Perf extensions
|
||||
--------------------------------
|
||||
|
||||
* The arm64 specific DWARF standard [5] added the VG (Vector Granule) register
|
||||
at index 46. This register is used for DWARF unwinding when variable length
|
||||
SVE registers are pushed onto the stack.
|
||||
|
||||
* Its value is equivalent to the current SVE vector length (VL) in bits divided
|
||||
by 64.
|
||||
|
||||
* The value is included in Perf samples in the regs[46] field if
|
||||
PERF_SAMPLE_REGS_USER is set and the sample_regs_user mask has bit 46 set.
|
||||
|
||||
* The value is the current value at the time the sample was taken, and it can
|
||||
change over time.
|
||||
|
||||
* If the system doesn't support SVE when perf_event_open is called with these
|
||||
settings, the event will fail to open.
|
||||
|
||||
Appendix A. SVE programmer's model (informative)
|
||||
=================================================
|
||||
@@ -593,3 +611,5 @@ References
|
||||
http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055c/IHI0055C_beta_aapcs64.pdf
|
||||
http://infocenter.arm.com/help/topic/com.arm.doc.subset.swdev.abi/index.html
|
||||
Procedure Call Standard for the ARM 64-bit Architecture (AArch64)
|
||||
|
||||
[5] https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst
|
||||
|
||||
@@ -58,13 +58,11 @@ Like with atomic_t, the rule of thumb is:
|
||||
|
||||
- RMW operations that have a return value are fully ordered.
|
||||
|
||||
- RMW operations that are conditional are unordered on FAILURE,
|
||||
otherwise the above rules apply. In the case of test_and_set_bit_lock(),
|
||||
if the bit in memory is unchanged by the operation then it is deemed to have
|
||||
failed.
|
||||
- RMW operations that are conditional are fully ordered.
|
||||
|
||||
Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics and
|
||||
clear_bit_unlock() which has RELEASE semantics.
|
||||
Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics,
|
||||
clear_bit_unlock() which has RELEASE semantics and test_bit_acquire which has
|
||||
ACQUIRE semantics.
|
||||
|
||||
Since a platform only has a single means of achieving atomic operations
|
||||
the same barriers as for atomic_t are used, see atomic_t.txt.
|
||||
|
||||
@@ -23,3 +23,4 @@ Block
|
||||
stat
|
||||
switching-sched
|
||||
writeback_cache_control
|
||||
ublk
|
||||
|
||||
253
Documentation/block/ublk.rst
Normal file
253
Documentation/block/ublk.rst
Normal file
@@ -0,0 +1,253 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
===========================================
|
||||
Userspace block device driver (ublk driver)
|
||||
===========================================
|
||||
|
||||
Overview
|
||||
========
|
||||
|
||||
ublk is a generic framework for implementing block device logic from userspace.
|
||||
The motivation behind it is that moving virtual block drivers into userspace,
|
||||
such as loop, nbd and similar can be very helpful. It can help to implement
|
||||
new virtual block device such as ublk-qcow2 (there are several attempts of
|
||||
implementing qcow2 driver in kernel).
|
||||
|
||||
Userspace block devices are attractive because:
|
||||
|
||||
- They can be written many programming languages.
|
||||
- They can use libraries that are not available in the kernel.
|
||||
- They can be debugged with tools familiar to application developers.
|
||||
- Crashes do not kernel panic the machine.
|
||||
- Bugs are likely to have a lower security impact than bugs in kernel
|
||||
code.
|
||||
- They can be installed and updated independently of the kernel.
|
||||
- They can be used to simulate block device easily with user specified
|
||||
parameters/setting for test/debug purpose
|
||||
|
||||
ublk block device (``/dev/ublkb*``) is added by ublk driver. Any IO request
|
||||
on the device will be forwarded to ublk userspace program. For convenience,
|
||||
in this document, ``ublk server`` refers to generic ublk userspace
|
||||
program. ``ublksrv`` [#userspace]_ is one of such implementation. It
|
||||
provides ``libublksrv`` [#userspace_lib]_ library for developing specific
|
||||
user block device conveniently, while also generic type block device is
|
||||
included, such as loop and null. Richard W.M. Jones wrote userspace nbd device
|
||||
``nbdublk`` [#userspace_nbdublk]_ based on ``libublksrv`` [#userspace_lib]_.
|
||||
|
||||
After the IO is handled by userspace, the result is committed back to the
|
||||
driver, thus completing the request cycle. This way, any specific IO handling
|
||||
logic is totally done by userspace, such as loop's IO handling, NBD's IO
|
||||
communication, or qcow2's IO mapping.
|
||||
|
||||
``/dev/ublkb*`` is driven by blk-mq request-based driver. Each request is
|
||||
assigned by one queue wide unique tag. ublk server assigns unique tag to each
|
||||
IO too, which is 1:1 mapped with IO of ``/dev/ublkb*``.
|
||||
|
||||
Both the IO request forward and IO handling result committing are done via
|
||||
``io_uring`` passthrough command; that is why ublk is also one io_uring based
|
||||
block driver. It has been observed that using io_uring passthrough command can
|
||||
give better IOPS than block IO; which is why ublk is one of high performance
|
||||
implementation of userspace block device: not only IO request communication is
|
||||
done by io_uring, but also the preferred IO handling in ublk server is io_uring
|
||||
based approach too.
|
||||
|
||||
ublk provides control interface to set/get ublk block device parameters.
|
||||
The interface is extendable and kabi compatible: basically any ublk request
|
||||
queue's parameter or ublk generic feature parameters can be set/get via the
|
||||
interface. Thus, ublk is generic userspace block device framework.
|
||||
For example, it is easy to setup a ublk device with specified block
|
||||
parameters from userspace.
|
||||
|
||||
Using ublk
|
||||
==========
|
||||
|
||||
ublk requires userspace ublk server to handle real block device logic.
|
||||
|
||||
Below is example of using ``ublksrv`` to provide ublk-based loop device.
|
||||
|
||||
- add a device::
|
||||
|
||||
ublk add -t loop -f ublk-loop.img
|
||||
|
||||
- format with xfs, then use it::
|
||||
|
||||
mkfs.xfs /dev/ublkb0
|
||||
mount /dev/ublkb0 /mnt
|
||||
# do anything. all IOs are handled by io_uring
|
||||
...
|
||||
umount /mnt
|
||||
|
||||
- list the devices with their info::
|
||||
|
||||
ublk list
|
||||
|
||||
- delete the device::
|
||||
|
||||
ublk del -a
|
||||
ublk del -n $ublk_dev_id
|
||||
|
||||
See usage details in README of ``ublksrv`` [#userspace_readme]_.
|
||||
|
||||
Design
|
||||
======
|
||||
|
||||
Control plane
|
||||
-------------
|
||||
|
||||
ublk driver provides global misc device node (``/dev/ublk-control``) for
|
||||
managing and controlling ublk devices with help of several control commands:
|
||||
|
||||
- ``UBLK_CMD_ADD_DEV``
|
||||
|
||||
Add a ublk char device (``/dev/ublkc*``) which is talked with ublk server
|
||||
WRT IO command communication. Basic device info is sent together with this
|
||||
command. It sets UAPI structure of ``ublksrv_ctrl_dev_info``,
|
||||
such as ``nr_hw_queues``, ``queue_depth``, and max IO request buffer size,
|
||||
for which the info is negotiated with the driver and sent back to the server.
|
||||
When this command is completed, the basic device info is immutable.
|
||||
|
||||
- ``UBLK_CMD_SET_PARAMS`` / ``UBLK_CMD_GET_PARAMS``
|
||||
|
||||
Set or get parameters of the device, which can be either generic feature
|
||||
related, or request queue limit related, but can't be IO logic specific,
|
||||
because the driver does not handle any IO logic. This command has to be
|
||||
sent before sending ``UBLK_CMD_START_DEV``.
|
||||
|
||||
- ``UBLK_CMD_START_DEV``
|
||||
|
||||
After the server prepares userspace resources (such as creating per-queue
|
||||
pthread & io_uring for handling ublk IO), this command is sent to the
|
||||
driver for allocating & exposing ``/dev/ublkb*``. Parameters set via
|
||||
``UBLK_CMD_SET_PARAMS`` are applied for creating the device.
|
||||
|
||||
- ``UBLK_CMD_STOP_DEV``
|
||||
|
||||
Halt IO on ``/dev/ublkb*`` and remove the device. When this command returns,
|
||||
ublk server will release resources (such as destroying per-queue pthread &
|
||||
io_uring).
|
||||
|
||||
- ``UBLK_CMD_DEL_DEV``
|
||||
|
||||
Remove ``/dev/ublkc*``. When this command returns, the allocated ublk device
|
||||
number can be reused.
|
||||
|
||||
- ``UBLK_CMD_GET_QUEUE_AFFINITY``
|
||||
|
||||
When ``/dev/ublkc`` is added, the driver creates block layer tagset, so
|
||||
that each queue's affinity info is available. The server sends
|
||||
``UBLK_CMD_GET_QUEUE_AFFINITY`` to retrieve queue affinity info. It can
|
||||
set up the per-queue context efficiently, such as bind affine CPUs with IO
|
||||
pthread and try to allocate buffers in IO thread context.
|
||||
|
||||
- ``UBLK_CMD_GET_DEV_INFO``
|
||||
|
||||
For retrieving device info via ``ublksrv_ctrl_dev_info``. It is the server's
|
||||
responsibility to save IO target specific info in userspace.
|
||||
|
||||
Data plane
|
||||
----------
|
||||
|
||||
ublk server needs to create per-queue IO pthread & io_uring for handling IO
|
||||
commands via io_uring passthrough. The per-queue IO pthread
|
||||
focuses on IO handling and shouldn't handle any control & management
|
||||
tasks.
|
||||
|
||||
The's IO is assigned by a unique tag, which is 1:1 mapping with IO
|
||||
request of ``/dev/ublkb*``.
|
||||
|
||||
UAPI structure of ``ublksrv_io_desc`` is defined for describing each IO from
|
||||
the driver. A fixed mmaped area (array) on ``/dev/ublkc*`` is provided for
|
||||
exporting IO info to the server; such as IO offset, length, OP/flags and
|
||||
buffer address. Each ``ublksrv_io_desc`` instance can be indexed via queue id
|
||||
and IO tag directly.
|
||||
|
||||
The following IO commands are communicated via io_uring passthrough command,
|
||||
and each command is only for forwarding the IO and committing the result
|
||||
with specified IO tag in the command data:
|
||||
|
||||
- ``UBLK_IO_FETCH_REQ``
|
||||
|
||||
Sent from the server IO pthread for fetching future incoming IO requests
|
||||
destined to ``/dev/ublkb*``. This command is sent only once from the server
|
||||
IO pthread for ublk driver to setup IO forward environment.
|
||||
|
||||
- ``UBLK_IO_COMMIT_AND_FETCH_REQ``
|
||||
|
||||
When an IO request is destined to ``/dev/ublkb*``, the driver stores
|
||||
the IO's ``ublksrv_io_desc`` to the specified mapped area; then the
|
||||
previous received IO command of this IO tag (either ``UBLK_IO_FETCH_REQ``
|
||||
or ``UBLK_IO_COMMIT_AND_FETCH_REQ)`` is completed, so the server gets
|
||||
the IO notification via io_uring.
|
||||
|
||||
After the server handles the IO, its result is committed back to the
|
||||
driver by sending ``UBLK_IO_COMMIT_AND_FETCH_REQ`` back. Once ublkdrv
|
||||
received this command, it parses the result and complete the request to
|
||||
``/dev/ublkb*``. In the meantime setup environment for fetching future
|
||||
requests with the same IO tag. That is, ``UBLK_IO_COMMIT_AND_FETCH_REQ``
|
||||
is reused for both fetching request and committing back IO result.
|
||||
|
||||
- ``UBLK_IO_NEED_GET_DATA``
|
||||
|
||||
With ``UBLK_F_NEED_GET_DATA`` enabled, the WRITE request will be firstly
|
||||
issued to ublk server without data copy. Then, IO backend of ublk server
|
||||
receives the request and it can allocate data buffer and embed its addr
|
||||
inside this new io command. After the kernel driver gets the command,
|
||||
data copy is done from request pages to this backend's buffer. Finally,
|
||||
backend receives the request again with data to be written and it can
|
||||
truly handle the request.
|
||||
|
||||
``UBLK_IO_NEED_GET_DATA`` adds one additional round-trip and one
|
||||
io_uring_enter() syscall. Any user thinks that it may lower performance
|
||||
should not enable UBLK_F_NEED_GET_DATA. ublk server pre-allocates IO
|
||||
buffer for each IO by default. Any new project should try to use this
|
||||
buffer to communicate with ublk driver. However, existing project may
|
||||
break or not able to consume the new buffer interface; that's why this
|
||||
command is added for backwards compatibility so that existing projects
|
||||
can still consume existing buffers.
|
||||
|
||||
- data copy between ublk server IO buffer and ublk block IO request
|
||||
|
||||
The driver needs to copy the block IO request pages into the server buffer
|
||||
(pages) first for WRITE before notifying the server of the coming IO, so
|
||||
that the server can handle WRITE request.
|
||||
|
||||
When the server handles READ request and sends
|
||||
``UBLK_IO_COMMIT_AND_FETCH_REQ`` to the server, ublkdrv needs to copy
|
||||
the server buffer (pages) read to the IO request pages.
|
||||
|
||||
Future development
|
||||
==================
|
||||
|
||||
Container-aware ublk deivice
|
||||
----------------------------
|
||||
|
||||
ublk driver doesn't handle any IO logic. Its function is well defined
|
||||
for now and very limited userspace interfaces are needed, which is also
|
||||
well defined too. It is possible to make ublk devices container-aware block
|
||||
devices in future as Stefan Hajnoczi suggested [#stefan]_, by removing
|
||||
ADMIN privilege.
|
||||
|
||||
Zero copy
|
||||
---------
|
||||
|
||||
Zero copy is a generic requirement for nbd, fuse or similar drivers. A
|
||||
problem [#xiaoguang]_ Xiaoguang mentioned is that pages mapped to userspace
|
||||
can't be remapped any more in kernel with existing mm interfaces. This can
|
||||
occurs when destining direct IO to ``/dev/ublkb*``. Also, he reported that
|
||||
big requests (IO size >= 256 KB) may benefit a lot from zero copy.
|
||||
|
||||
|
||||
References
|
||||
==========
|
||||
|
||||
.. [#userspace] https://github.com/ming1/ubdsrv
|
||||
|
||||
.. [#userspace_lib] https://github.com/ming1/ubdsrv/tree/master/lib
|
||||
|
||||
.. [#userspace_nbdublk] https://gitlab.com/rwmjones/libnbd/-/tree/nbdublk
|
||||
|
||||
.. [#userspace_readme] https://github.com/ming1/ubdsrv/blob/master/README
|
||||
|
||||
.. [#stefan] https://lore.kernel.org/linux-block/YoOr6jBfgVm8GvWg@stefanha-x1.localdomain/
|
||||
|
||||
.. [#xiaoguang] https://lore.kernel.org/linux-block/YoOr6jBfgVm8GvWg@stefanha-x1.localdomain/
|
||||
30
Documentation/bpf/clang-notes.rst
Normal file
30
Documentation/bpf/clang-notes.rst
Normal file
@@ -0,0 +1,30 @@
|
||||
.. contents::
|
||||
.. sectnum::
|
||||
|
||||
==========================
|
||||
Clang implementation notes
|
||||
==========================
|
||||
|
||||
This document provides more details specific to the Clang/LLVM implementation of the eBPF instruction set.
|
||||
|
||||
Versions
|
||||
========
|
||||
|
||||
Clang defined "CPU" versions, where a CPU version of 3 corresponds to the current eBPF ISA.
|
||||
|
||||
Clang can select the eBPF ISA version using ``-mcpu=v3`` for example to select version 3.
|
||||
|
||||
Arithmetic instructions
|
||||
=======================
|
||||
|
||||
For CPU versions prior to 3, Clang v7.0 and later can enable ``BPF_ALU`` support with
|
||||
``-Xclang -target-feature -Xclang +alu32``. In CPU version 3, support is automatically included.
|
||||
|
||||
Atomic operations
|
||||
=================
|
||||
|
||||
Clang can generate atomic instructions by default when ``-mcpu=v3`` is
|
||||
enabled. If a lower version for ``-mcpu`` is set, the only atomic instruction
|
||||
Clang can generate is ``BPF_ADD`` *without* ``BPF_FETCH``. If you need to enable
|
||||
the atomics features, while keeping a lower ``-mcpu`` version, you can use
|
||||
``-Xclang -target-feature -Xclang +alu32``.
|
||||
@@ -26,6 +26,8 @@ that goes into great technical depth about the BPF Architecture.
|
||||
classic_vs_extended.rst
|
||||
bpf_licensing
|
||||
test_debug
|
||||
clang-notes
|
||||
linux-notes
|
||||
other
|
||||
|
||||
.. only:: subproject and html
|
||||
|
||||
@@ -1,7 +1,12 @@
|
||||
.. contents::
|
||||
.. sectnum::
|
||||
|
||||
========================================
|
||||
eBPF Instruction Set Specification, v1.0
|
||||
========================================
|
||||
|
||||
This document specifies version 1.0 of the eBPF instruction set.
|
||||
|
||||
====================
|
||||
eBPF Instruction Set
|
||||
====================
|
||||
|
||||
Registers and calling convention
|
||||
================================
|
||||
@@ -11,10 +16,10 @@ all of which are 64-bits wide.
|
||||
|
||||
The eBPF calling convention is defined as:
|
||||
|
||||
* R0: return value from function calls, and exit value for eBPF programs
|
||||
* R1 - R5: arguments for function calls
|
||||
* R6 - R9: callee saved registers that function calls will preserve
|
||||
* R10: read-only frame pointer to access stack
|
||||
* R0: return value from function calls, and exit value for eBPF programs
|
||||
* R1 - R5: arguments for function calls
|
||||
* R6 - R9: callee saved registers that function calls will preserve
|
||||
* R10: read-only frame pointer to access stack
|
||||
|
||||
R0 - R5 are scratch registers and eBPF programs needs to spill/fill them if
|
||||
necessary across calls.
|
||||
@@ -24,17 +29,17 @@ Instruction encoding
|
||||
|
||||
eBPF has two instruction encodings:
|
||||
|
||||
* the basic instruction encoding, which uses 64 bits to encode an instruction
|
||||
* the wide instruction encoding, which appends a second 64-bit immediate value
|
||||
(imm64) after the basic instruction for a total of 128 bits.
|
||||
* the basic instruction encoding, which uses 64 bits to encode an instruction
|
||||
* the wide instruction encoding, which appends a second 64-bit immediate value
|
||||
(imm64) after the basic instruction for a total of 128 bits.
|
||||
|
||||
The basic instruction encoding looks as follows:
|
||||
|
||||
============= ======= =============== ==================== ============
|
||||
32 bits (MSB) 16 bits 4 bits 4 bits 8 bits (LSB)
|
||||
============= ======= =============== ==================== ============
|
||||
immediate offset source register destination register opcode
|
||||
============= ======= =============== ==================== ============
|
||||
============= ======= =============== ==================== ============
|
||||
32 bits (MSB) 16 bits 4 bits 4 bits 8 bits (LSB)
|
||||
============= ======= =============== ==================== ============
|
||||
immediate offset source register destination register opcode
|
||||
============= ======= =============== ==================== ============
|
||||
|
||||
Note that most instructions do not use all of the fields.
|
||||
Unused fields shall be cleared to zero.
|
||||
@@ -44,30 +49,30 @@ Instruction classes
|
||||
|
||||
The three LSB bits of the 'opcode' field store the instruction class:
|
||||
|
||||
========= ===== ===============================
|
||||
class value description
|
||||
========= ===== ===============================
|
||||
BPF_LD 0x00 non-standard load operations
|
||||
BPF_LDX 0x01 load into register operations
|
||||
BPF_ST 0x02 store from immediate operations
|
||||
BPF_STX 0x03 store from register operations
|
||||
BPF_ALU 0x04 32-bit arithmetic operations
|
||||
BPF_JMP 0x05 64-bit jump operations
|
||||
BPF_JMP32 0x06 32-bit jump operations
|
||||
BPF_ALU64 0x07 64-bit arithmetic operations
|
||||
========= ===== ===============================
|
||||
========= ===== =============================== ===================================
|
||||
class value description reference
|
||||
========= ===== =============================== ===================================
|
||||
BPF_LD 0x00 non-standard load operations `Load and store instructions`_
|
||||
BPF_LDX 0x01 load into register operations `Load and store instructions`_
|
||||
BPF_ST 0x02 store from immediate operations `Load and store instructions`_
|
||||
BPF_STX 0x03 store from register operations `Load and store instructions`_
|
||||
BPF_ALU 0x04 32-bit arithmetic operations `Arithmetic and jump instructions`_
|
||||
BPF_JMP 0x05 64-bit jump operations `Arithmetic and jump instructions`_
|
||||
BPF_JMP32 0x06 32-bit jump operations `Arithmetic and jump instructions`_
|
||||
BPF_ALU64 0x07 64-bit arithmetic operations `Arithmetic and jump instructions`_
|
||||
========= ===== =============================== ===================================
|
||||
|
||||
Arithmetic and jump instructions
|
||||
================================
|
||||
|
||||
For arithmetic and jump instructions (BPF_ALU, BPF_ALU64, BPF_JMP and
|
||||
BPF_JMP32), the 8-bit 'opcode' field is divided into three parts:
|
||||
For arithmetic and jump instructions (``BPF_ALU``, ``BPF_ALU64``, ``BPF_JMP`` and
|
||||
``BPF_JMP32``), the 8-bit 'opcode' field is divided into three parts:
|
||||
|
||||
============== ====== =================
|
||||
4 bits (MSB) 1 bit 3 bits (LSB)
|
||||
============== ====== =================
|
||||
operation code source instruction class
|
||||
============== ====== =================
|
||||
============== ====== =================
|
||||
4 bits (MSB) 1 bit 3 bits (LSB)
|
||||
============== ====== =================
|
||||
operation code source instruction class
|
||||
============== ====== =================
|
||||
|
||||
The 4th bit encodes the source operand:
|
||||
|
||||
@@ -84,66 +89,66 @@ The four MSB bits store the operation code.
|
||||
Arithmetic instructions
|
||||
-----------------------
|
||||
|
||||
BPF_ALU uses 32-bit wide operands while BPF_ALU64 uses 64-bit wide operands for
|
||||
``BPF_ALU`` uses 32-bit wide operands while ``BPF_ALU64`` uses 64-bit wide operands for
|
||||
otherwise identical operations.
|
||||
The code field encodes the operation as below:
|
||||
The 'code' field encodes the operation as below:
|
||||
|
||||
======== ===== =================================================
|
||||
code value description
|
||||
======== ===== =================================================
|
||||
BPF_ADD 0x00 dst += src
|
||||
BPF_SUB 0x10 dst -= src
|
||||
BPF_MUL 0x20 dst \*= src
|
||||
BPF_DIV 0x30 dst /= src
|
||||
BPF_OR 0x40 dst \|= src
|
||||
BPF_AND 0x50 dst &= src
|
||||
BPF_LSH 0x60 dst <<= src
|
||||
BPF_RSH 0x70 dst >>= src
|
||||
BPF_NEG 0x80 dst = ~src
|
||||
BPF_MOD 0x90 dst %= src
|
||||
BPF_XOR 0xa0 dst ^= src
|
||||
BPF_MOV 0xb0 dst = src
|
||||
BPF_ARSH 0xc0 sign extending shift right
|
||||
BPF_END 0xd0 byte swap operations (see separate section below)
|
||||
======== ===== =================================================
|
||||
======== ===== ==========================================================
|
||||
code value description
|
||||
======== ===== ==========================================================
|
||||
BPF_ADD 0x00 dst += src
|
||||
BPF_SUB 0x10 dst -= src
|
||||
BPF_MUL 0x20 dst \*= src
|
||||
BPF_DIV 0x30 dst /= src
|
||||
BPF_OR 0x40 dst \|= src
|
||||
BPF_AND 0x50 dst &= src
|
||||
BPF_LSH 0x60 dst <<= src
|
||||
BPF_RSH 0x70 dst >>= src
|
||||
BPF_NEG 0x80 dst = ~src
|
||||
BPF_MOD 0x90 dst %= src
|
||||
BPF_XOR 0xa0 dst ^= src
|
||||
BPF_MOV 0xb0 dst = src
|
||||
BPF_ARSH 0xc0 sign extending shift right
|
||||
BPF_END 0xd0 byte swap operations (see `Byte swap instructions`_ below)
|
||||
======== ===== ==========================================================
|
||||
|
||||
BPF_ADD | BPF_X | BPF_ALU means::
|
||||
``BPF_ADD | BPF_X | BPF_ALU`` means::
|
||||
|
||||
dst_reg = (u32) dst_reg + (u32) src_reg;
|
||||
|
||||
BPF_ADD | BPF_X | BPF_ALU64 means::
|
||||
``BPF_ADD | BPF_X | BPF_ALU64`` means::
|
||||
|
||||
dst_reg = dst_reg + src_reg
|
||||
|
||||
BPF_XOR | BPF_K | BPF_ALU means::
|
||||
``BPF_XOR | BPF_K | BPF_ALU`` means::
|
||||
|
||||
src_reg = (u32) src_reg ^ (u32) imm32
|
||||
|
||||
BPF_XOR | BPF_K | BPF_ALU64 means::
|
||||
``BPF_XOR | BPF_K | BPF_ALU64`` means::
|
||||
|
||||
src_reg = src_reg ^ imm32
|
||||
|
||||
|
||||
Byte swap instructions
|
||||
----------------------
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The byte swap instructions use an instruction class of ``BPF_ALU`` and a 4-bit
|
||||
code field of ``BPF_END``.
|
||||
'code' field of ``BPF_END``.
|
||||
|
||||
The byte swap instructions operate on the destination register
|
||||
only and do not use a separate source register or immediate value.
|
||||
|
||||
The 1-bit source operand field in the opcode is used to to select what byte
|
||||
The 1-bit source operand field in the opcode is used to select what byte
|
||||
order the operation convert from or to:
|
||||
|
||||
========= ===== =================================================
|
||||
source value description
|
||||
========= ===== =================================================
|
||||
BPF_TO_LE 0x00 convert between host byte order and little endian
|
||||
BPF_TO_BE 0x08 convert between host byte order and big endian
|
||||
========= ===== =================================================
|
||||
========= ===== =================================================
|
||||
source value description
|
||||
========= ===== =================================================
|
||||
BPF_TO_LE 0x00 convert between host byte order and little endian
|
||||
BPF_TO_BE 0x08 convert between host byte order and big endian
|
||||
========= ===== =================================================
|
||||
|
||||
The imm field encodes the width of the swap operations. The following widths
|
||||
The 'imm' field encodes the width of the swap operations. The following widths
|
||||
are supported: 16, 32 and 64.
|
||||
|
||||
Examples:
|
||||
@@ -156,35 +161,31 @@ Examples:
|
||||
|
||||
dst_reg = htobe64(dst_reg)
|
||||
|
||||
``BPF_FROM_LE`` and ``BPF_FROM_BE`` exist as aliases for ``BPF_TO_LE`` and
|
||||
``BPF_TO_BE`` respectively.
|
||||
|
||||
|
||||
Jump instructions
|
||||
-----------------
|
||||
|
||||
BPF_JMP32 uses 32-bit wide operands while BPF_JMP uses 64-bit wide operands for
|
||||
``BPF_JMP32`` uses 32-bit wide operands while ``BPF_JMP`` uses 64-bit wide operands for
|
||||
otherwise identical operations.
|
||||
The code field encodes the operation as below:
|
||||
The 'code' field encodes the operation as below:
|
||||
|
||||
======== ===== ========================= ============
|
||||
code value description notes
|
||||
======== ===== ========================= ============
|
||||
BPF_JA 0x00 PC += off BPF_JMP only
|
||||
BPF_JEQ 0x10 PC += off if dst == src
|
||||
BPF_JGT 0x20 PC += off if dst > src unsigned
|
||||
BPF_JGE 0x30 PC += off if dst >= src unsigned
|
||||
BPF_JSET 0x40 PC += off if dst & src
|
||||
BPF_JNE 0x50 PC += off if dst != src
|
||||
BPF_JSGT 0x60 PC += off if dst > src signed
|
||||
BPF_JSGE 0x70 PC += off if dst >= src signed
|
||||
BPF_CALL 0x80 function call
|
||||
BPF_EXIT 0x90 function / program return BPF_JMP only
|
||||
BPF_JLT 0xa0 PC += off if dst < src unsigned
|
||||
BPF_JLE 0xb0 PC += off if dst <= src unsigned
|
||||
BPF_JSLT 0xc0 PC += off if dst < src signed
|
||||
BPF_JSLE 0xd0 PC += off if dst <= src signed
|
||||
======== ===== ========================= ============
|
||||
======== ===== ========================= ============
|
||||
code value description notes
|
||||
======== ===== ========================= ============
|
||||
BPF_JA 0x00 PC += off BPF_JMP only
|
||||
BPF_JEQ 0x10 PC += off if dst == src
|
||||
BPF_JGT 0x20 PC += off if dst > src unsigned
|
||||
BPF_JGE 0x30 PC += off if dst >= src unsigned
|
||||
BPF_JSET 0x40 PC += off if dst & src
|
||||
BPF_JNE 0x50 PC += off if dst != src
|
||||
BPF_JSGT 0x60 PC += off if dst > src signed
|
||||
BPF_JSGE 0x70 PC += off if dst >= src signed
|
||||
BPF_CALL 0x80 function call
|
||||
BPF_EXIT 0x90 function / program return BPF_JMP only
|
||||
BPF_JLT 0xa0 PC += off if dst < src unsigned
|
||||
BPF_JLE 0xb0 PC += off if dst <= src unsigned
|
||||
BPF_JSLT 0xc0 PC += off if dst < src signed
|
||||
BPF_JSLE 0xd0 PC += off if dst <= src signed
|
||||
======== ===== ========================= ============
|
||||
|
||||
The eBPF program needs to store the return value into register R0 before doing a
|
||||
BPF_EXIT.
|
||||
@@ -193,14 +194,26 @@ BPF_EXIT.
|
||||
Load and store instructions
|
||||
===========================
|
||||
|
||||
For load and store instructions (BPF_LD, BPF_LDX, BPF_ST and BPF_STX), the
|
||||
For load and store instructions (``BPF_LD``, ``BPF_LDX``, ``BPF_ST``, and ``BPF_STX``), the
|
||||
8-bit 'opcode' field is divided as:
|
||||
|
||||
============ ====== =================
|
||||
3 bits (MSB) 2 bits 3 bits (LSB)
|
||||
============ ====== =================
|
||||
mode size instruction class
|
||||
============ ====== =================
|
||||
============ ====== =================
|
||||
3 bits (MSB) 2 bits 3 bits (LSB)
|
||||
============ ====== =================
|
||||
mode size instruction class
|
||||
============ ====== =================
|
||||
|
||||
The mode modifier is one of:
|
||||
|
||||
============= ===== ==================================== =============
|
||||
mode modifier value description reference
|
||||
============= ===== ==================================== =============
|
||||
BPF_IMM 0x00 64-bit immediate instructions `64-bit immediate instructions`_
|
||||
BPF_ABS 0x20 legacy BPF packet access (absolute) `Legacy BPF Packet access instructions`_
|
||||
BPF_IND 0x40 legacy BPF packet access (indirect) `Legacy BPF Packet access instructions`_
|
||||
BPF_MEM 0x60 regular load and store operations `Regular load and store operations`_
|
||||
BPF_ATOMIC 0xc0 atomic operations `Atomic operations`_
|
||||
============= ===== ==================================== =============
|
||||
|
||||
The size modifier is one of:
|
||||
|
||||
@@ -213,19 +226,6 @@ The size modifier is one of:
|
||||
BPF_DW 0x18 double word (8 bytes)
|
||||
============= ===== =====================
|
||||
|
||||
The mode modifier is one of:
|
||||
|
||||
============= ===== ====================================
|
||||
mode modifier value description
|
||||
============= ===== ====================================
|
||||
BPF_IMM 0x00 64-bit immediate instructions
|
||||
BPF_ABS 0x20 legacy BPF packet access (absolute)
|
||||
BPF_IND 0x40 legacy BPF packet access (indirect)
|
||||
BPF_MEM 0x60 regular load and store operations
|
||||
BPF_ATOMIC 0xc0 atomic operations
|
||||
============= ===== ====================================
|
||||
|
||||
|
||||
Regular load and store operations
|
||||
---------------------------------
|
||||
|
||||
@@ -256,44 +256,42 @@ by other eBPF programs or means outside of this specification.
|
||||
All atomic operations supported by eBPF are encoded as store operations
|
||||
that use the ``BPF_ATOMIC`` mode modifier as follows:
|
||||
|
||||
* ``BPF_ATOMIC | BPF_W | BPF_STX`` for 32-bit operations
|
||||
* ``BPF_ATOMIC | BPF_DW | BPF_STX`` for 64-bit operations
|
||||
* 8-bit and 16-bit wide atomic operations are not supported.
|
||||
* ``BPF_ATOMIC | BPF_W | BPF_STX`` for 32-bit operations
|
||||
* ``BPF_ATOMIC | BPF_DW | BPF_STX`` for 64-bit operations
|
||||
* 8-bit and 16-bit wide atomic operations are not supported.
|
||||
|
||||
The imm field is used to encode the actual atomic operation.
|
||||
The 'imm' field is used to encode the actual atomic operation.
|
||||
Simple atomic operation use a subset of the values defined to encode
|
||||
arithmetic operations in the imm field to encode the atomic operation:
|
||||
arithmetic operations in the 'imm' field to encode the atomic operation:
|
||||
|
||||
======== ===== ===========
|
||||
imm value description
|
||||
======== ===== ===========
|
||||
BPF_ADD 0x00 atomic add
|
||||
BPF_OR 0x40 atomic or
|
||||
BPF_AND 0x50 atomic and
|
||||
BPF_XOR 0xa0 atomic xor
|
||||
======== ===== ===========
|
||||
======== ===== ===========
|
||||
imm value description
|
||||
======== ===== ===========
|
||||
BPF_ADD 0x00 atomic add
|
||||
BPF_OR 0x40 atomic or
|
||||
BPF_AND 0x50 atomic and
|
||||
BPF_XOR 0xa0 atomic xor
|
||||
======== ===== ===========
|
||||
|
||||
|
||||
``BPF_ATOMIC | BPF_W | BPF_STX`` with imm = BPF_ADD means::
|
||||
``BPF_ATOMIC | BPF_W | BPF_STX`` with 'imm' = BPF_ADD means::
|
||||
|
||||
*(u32 *)(dst_reg + off16) += src_reg
|
||||
|
||||
``BPF_ATOMIC | BPF_DW | BPF_STX`` with imm = BPF ADD means::
|
||||
``BPF_ATOMIC | BPF_DW | BPF_STX`` with 'imm' = BPF ADD means::
|
||||
|
||||
*(u64 *)(dst_reg + off16) += src_reg
|
||||
|
||||
``BPF_XADD`` is a deprecated name for ``BPF_ATOMIC | BPF_ADD``.
|
||||
|
||||
In addition to the simple atomic operations, there also is a modifier and
|
||||
two complex atomic operations:
|
||||
|
||||
=========== ================ ===========================
|
||||
imm value description
|
||||
=========== ================ ===========================
|
||||
BPF_FETCH 0x01 modifier: return old value
|
||||
BPF_XCHG 0xe0 | BPF_FETCH atomic exchange
|
||||
BPF_CMPXCHG 0xf0 | BPF_FETCH atomic compare and exchange
|
||||
=========== ================ ===========================
|
||||
=========== ================ ===========================
|
||||
imm value description
|
||||
=========== ================ ===========================
|
||||
BPF_FETCH 0x01 modifier: return old value
|
||||
BPF_XCHG 0xe0 | BPF_FETCH atomic exchange
|
||||
BPF_CMPXCHG 0xf0 | BPF_FETCH atomic compare and exchange
|
||||
=========== ================ ===========================
|
||||
|
||||
The ``BPF_FETCH`` modifier is optional for simple atomic operations, and
|
||||
always set for the complex atomic operations. If the ``BPF_FETCH`` flag
|
||||
@@ -309,16 +307,10 @@ The ``BPF_CMPXCHG`` operation atomically compares the value addressed by
|
||||
value that was at ``dst_reg + off`` before the operation is zero-extended
|
||||
and loaded back to ``R0``.
|
||||
|
||||
Clang can generate atomic instructions by default when ``-mcpu=v3`` is
|
||||
enabled. If a lower version for ``-mcpu`` is set, the only atomic instruction
|
||||
Clang can generate is ``BPF_ADD`` *without* ``BPF_FETCH``. If you need to enable
|
||||
the atomics features, while keeping a lower ``-mcpu`` version, you can use
|
||||
``-Xclang -target-feature -Xclang +alu32``.
|
||||
|
||||
64-bit immediate instructions
|
||||
-----------------------------
|
||||
|
||||
Instructions with the ``BPF_IMM`` mode modifier use the wide instruction
|
||||
Instructions with the ``BPF_IMM`` 'mode' modifier use the wide instruction
|
||||
encoding for an extra imm64 value.
|
||||
|
||||
There is currently only one such instruction.
|
||||
@@ -331,36 +323,6 @@ There is currently only one such instruction.
|
||||
Legacy BPF Packet access instructions
|
||||
-------------------------------------
|
||||
|
||||
eBPF has special instructions for access to packet data that have been
|
||||
carried over from classic BPF to retain the performance of legacy socket
|
||||
filters running in the eBPF interpreter.
|
||||
|
||||
The instructions come in two forms: ``BPF_ABS | <size> | BPF_LD`` and
|
||||
``BPF_IND | <size> | BPF_LD``.
|
||||
|
||||
These instructions are used to access packet data and can only be used when
|
||||
the program context is a pointer to networking packet. ``BPF_ABS``
|
||||
accesses packet data at an absolute offset specified by the immediate data
|
||||
and ``BPF_IND`` access packet data at an offset that includes the value of
|
||||
a register in addition to the immediate data.
|
||||
|
||||
These instructions have seven implicit operands:
|
||||
|
||||
* Register R6 is an implicit input that must contain pointer to a
|
||||
struct sk_buff.
|
||||
* Register R0 is an implicit output which contains the data fetched from
|
||||
the packet.
|
||||
* Registers R1-R5 are scratch registers that are clobbered after a call to
|
||||
``BPF_ABS | BPF_LD`` or ``BPF_IND | BPF_LD`` instructions.
|
||||
|
||||
These instructions have an implicit program exit condition as well. When an
|
||||
eBPF program is trying to access the data beyond the packet boundary, the
|
||||
program execution will be aborted.
|
||||
|
||||
``BPF_ABS | BPF_W | BPF_LD`` means::
|
||||
|
||||
R0 = ntohl(*(u32 *) (((struct sk_buff *) R6)->data + imm32))
|
||||
|
||||
``BPF_IND | BPF_W | BPF_LD`` means::
|
||||
|
||||
R0 = ntohl(*(u32 *) (((struct sk_buff *) R6)->data + src_reg + imm32))
|
||||
eBPF previously introduced special instructions for access to packet data that were
|
||||
carried over from classic BPF. However, these instructions are
|
||||
deprecated and should no longer be used.
|
||||
|
||||
@@ -137,14 +137,37 @@ KF_ACQUIRE and KF_RET_NULL flags.
|
||||
--------------------------
|
||||
|
||||
The KF_TRUSTED_ARGS flag is used for kfuncs taking pointer arguments. It
|
||||
indicates that the all pointer arguments will always be refcounted, and have
|
||||
their offset set to 0. It can be used to enforce that a pointer to a refcounted
|
||||
object acquired from a kfunc or BPF helper is passed as an argument to this
|
||||
kfunc without any modifications (e.g. pointer arithmetic) such that it is
|
||||
trusted and points to the original object. This flag is often used for kfuncs
|
||||
that operate (change some property, perform some operation) on an object that
|
||||
was obtained using an acquire kfunc. Such kfuncs need an unchanged pointer to
|
||||
ensure the integrity of the operation being performed on the expected object.
|
||||
indicates that the all pointer arguments will always have a guaranteed lifetime,
|
||||
and pointers to kernel objects are always passed to helpers in their unmodified
|
||||
form (as obtained from acquire kfuncs).
|
||||
|
||||
It can be used to enforce that a pointer to a refcounted object acquired from a
|
||||
kfunc or BPF helper is passed as an argument to this kfunc without any
|
||||
modifications (e.g. pointer arithmetic) such that it is trusted and points to
|
||||
the original object.
|
||||
|
||||
Meanwhile, it is also allowed pass pointers to normal memory to such kfuncs,
|
||||
but those can have a non-zero offset.
|
||||
|
||||
This flag is often used for kfuncs that operate (change some property, perform
|
||||
some operation) on an object that was obtained using an acquire kfunc. Such
|
||||
kfuncs need an unchanged pointer to ensure the integrity of the operation being
|
||||
performed on the expected object.
|
||||
|
||||
2.4.6 KF_SLEEPABLE flag
|
||||
-----------------------
|
||||
|
||||
The KF_SLEEPABLE flag is used for kfuncs that may sleep. Such kfuncs can only
|
||||
be called by sleepable BPF programs (BPF_F_SLEEPABLE).
|
||||
|
||||
2.4.7 KF_DESTRUCTIVE flag
|
||||
--------------------------
|
||||
|
||||
The KF_DESTRUCTIVE flag is used to indicate functions calling which is
|
||||
destructive to the system. For example such a call can result in system
|
||||
rebooting or panicking. Due to this additional restrictions apply to these
|
||||
calls. At the moment they only require CAP_SYS_BOOT capability, but more can be
|
||||
added later.
|
||||
|
||||
2.5 Registering the kfuncs
|
||||
--------------------------
|
||||
|
||||
53
Documentation/bpf/linux-notes.rst
Normal file
53
Documentation/bpf/linux-notes.rst
Normal file
@@ -0,0 +1,53 @@
|
||||
.. contents::
|
||||
.. sectnum::
|
||||
|
||||
==========================
|
||||
Linux implementation notes
|
||||
==========================
|
||||
|
||||
This document provides more details specific to the Linux kernel implementation of the eBPF instruction set.
|
||||
|
||||
Byte swap instructions
|
||||
======================
|
||||
|
||||
``BPF_FROM_LE`` and ``BPF_FROM_BE`` exist as aliases for ``BPF_TO_LE`` and ``BPF_TO_BE`` respectively.
|
||||
|
||||
Legacy BPF Packet access instructions
|
||||
=====================================
|
||||
|
||||
As mentioned in the `ISA standard documentation <instruction-set.rst#legacy-bpf-packet-access-instructions>`_,
|
||||
Linux has special eBPF instructions for access to packet data that have been
|
||||
carried over from classic BPF to retain the performance of legacy socket
|
||||
filters running in the eBPF interpreter.
|
||||
|
||||
The instructions come in two forms: ``BPF_ABS | <size> | BPF_LD`` and
|
||||
``BPF_IND | <size> | BPF_LD``.
|
||||
|
||||
These instructions are used to access packet data and can only be used when
|
||||
the program context is a pointer to a networking packet. ``BPF_ABS``
|
||||
accesses packet data at an absolute offset specified by the immediate data
|
||||
and ``BPF_IND`` access packet data at an offset that includes the value of
|
||||
a register in addition to the immediate data.
|
||||
|
||||
These instructions have seven implicit operands:
|
||||
|
||||
* Register R6 is an implicit input that must contain a pointer to a
|
||||
struct sk_buff.
|
||||
* Register R0 is an implicit output which contains the data fetched from
|
||||
the packet.
|
||||
* Registers R1-R5 are scratch registers that are clobbered by the
|
||||
instruction.
|
||||
|
||||
These instructions have an implicit program exit condition as well. If an
|
||||
eBPF program attempts access data beyond the packet boundary, the
|
||||
program execution will be aborted.
|
||||
|
||||
``BPF_ABS | BPF_W | BPF_LD`` (0x20) means::
|
||||
|
||||
R0 = ntohl(*(u32 *) ((struct sk_buff *) R6->data + imm))
|
||||
|
||||
where ``ntohl()`` converts a 32-bit value from network byte order to host byte order.
|
||||
|
||||
``BPF_IND | BPF_W | BPF_LD`` (0x40) means::
|
||||
|
||||
R0 = ntohl(*(u32 *) ((struct sk_buff *) R6->data + src + imm))
|
||||
@@ -31,7 +31,7 @@ The map uses key of type of either ``__u64 cgroup_inode_id`` or
|
||||
};
|
||||
|
||||
``cgroup_inode_id`` is the inode id of the cgroup directory.
|
||||
``attach_type`` is the the program's attach type.
|
||||
``attach_type`` is the program's attach type.
|
||||
|
||||
Linux 5.9 added support for type ``__u64 cgroup_inode_id`` as the key type.
|
||||
When this key type is used, then all attach types of the particular cgroup and
|
||||
@@ -155,7 +155,7 @@ However, the BPF program can still only associate with one map of each type
|
||||
``BPF_MAP_TYPE_CGROUP_STORAGE`` or more than one
|
||||
``BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE``.
|
||||
|
||||
In all versions, userspace may use the the attach parameters of cgroup and
|
||||
In all versions, userspace may use the attach parameters of cgroup and
|
||||
attach type pair in ``struct bpf_cgroup_storage_key`` as the key to the BPF map
|
||||
APIs to read or update the storage for a given attachment. For Linux 5.9
|
||||
attach type shared storages, only the first value in the struct, cgroup inode
|
||||
|
||||
@@ -15,6 +15,18 @@
|
||||
import sys
|
||||
import os
|
||||
import sphinx
|
||||
import shutil
|
||||
|
||||
# helper
|
||||
# ------
|
||||
|
||||
def have_command(cmd):
|
||||
"""Search ``cmd`` in the ``PATH`` environment.
|
||||
|
||||
If found, return True.
|
||||
If not found, return False.
|
||||
"""
|
||||
return shutil.which(cmd) is not None
|
||||
|
||||
# Get Sphinx version
|
||||
major, minor, patch = sphinx.version_info[:3]
|
||||
@@ -86,6 +98,7 @@ if major >= 3:
|
||||
"__used",
|
||||
"__weak",
|
||||
"noinline",
|
||||
"__fix_address",
|
||||
|
||||
# include/linux/memblock.h:
|
||||
"__init_memblock",
|
||||
@@ -106,7 +119,32 @@ else:
|
||||
autosectionlabel_prefix_document = True
|
||||
autosectionlabel_maxdepth = 2
|
||||
|
||||
extensions.append("sphinx.ext.imgmath")
|
||||
# Load math renderer:
|
||||
# For html builder, load imgmath only when its dependencies are met.
|
||||
# mathjax is the default math renderer since Sphinx 1.8.
|
||||
have_latex = have_command('latex')
|
||||
have_dvipng = have_command('dvipng')
|
||||
load_imgmath = have_latex and have_dvipng
|
||||
|
||||
# Respect SPHINX_IMGMATH (for html docs only)
|
||||
if 'SPHINX_IMGMATH' in os.environ:
|
||||
env_sphinx_imgmath = os.environ['SPHINX_IMGMATH']
|
||||
if 'yes' in env_sphinx_imgmath:
|
||||
load_imgmath = True
|
||||
elif 'no' in env_sphinx_imgmath:
|
||||
load_imgmath = False
|
||||
else:
|
||||
sys.stderr.write("Unknown env SPHINX_IMGMATH=%s ignored.\n" % env_sphinx_imgmath)
|
||||
|
||||
# Always load imgmath for Sphinx <1.8 or for epub docs
|
||||
load_imgmath = (load_imgmath or (major == 1 and minor < 8)
|
||||
or 'epub' in sys.argv)
|
||||
|
||||
if load_imgmath:
|
||||
extensions.append("sphinx.ext.imgmath")
|
||||
math_renderer = 'imgmath'
|
||||
else:
|
||||
math_renderer = 'mathjax'
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
templates_path = ['_templates']
|
||||
@@ -332,7 +370,8 @@ html_static_path = ['sphinx-static']
|
||||
html_use_smartypants = False
|
||||
|
||||
# Custom sidebar templates, maps document names to template names.
|
||||
#html_sidebars = {}
|
||||
# Note that the RTD theme ignores this.
|
||||
html_sidebars = { '**': ['searchbox.html', 'localtoc.html', 'sourcelink.html']}
|
||||
|
||||
# Additional templates that should be rendered to pages, maps page names to
|
||||
# template names.
|
||||
|
||||
@@ -43,10 +43,11 @@ annotated objects like this, tools can be run on them to generate more useful
|
||||
information. In particular, on properly annotated objects, ``objtool`` can be
|
||||
run to check and fix the object if needed. Currently, ``objtool`` can report
|
||||
missing frame pointer setup/destruction in functions. It can also
|
||||
automatically generate annotations for :doc:`ORC unwinder <x86/orc-unwinder>`
|
||||
automatically generate annotations for the ORC unwinder
|
||||
(Documentation/x86/orc-unwinder.rst)
|
||||
for most code. Both of these are especially important to support reliable
|
||||
stack traces which are in turn necessary for :doc:`Kernel live patching
|
||||
<livepatch/livepatch>`.
|
||||
stack traces which are in turn necessary for kernel live patching
|
||||
(Documentation/livepatch/livepatch.rst).
|
||||
|
||||
Caveat and Discussion
|
||||
---------------------
|
||||
@@ -560,7 +560,7 @@ available:
|
||||
* cpuhp_state_remove_instance(state, node)
|
||||
* cpuhp_state_remove_instance_nocalls(state, node)
|
||||
|
||||
The arguments are the same as for the the cpuhp_state_add_instance*()
|
||||
The arguments are the same as for the cpuhp_state_add_instance*()
|
||||
variants above.
|
||||
|
||||
The functions differ in the way how the installed callbacks are treated:
|
||||
|
||||
@@ -23,6 +23,7 @@ it.
|
||||
printk-formats
|
||||
printk-index
|
||||
symbol-namespaces
|
||||
asm-annotations
|
||||
|
||||
Data structures and low-level utilities
|
||||
=======================================
|
||||
@@ -36,6 +37,7 @@ Library functionality that is used throughout the kernel.
|
||||
kref
|
||||
assoc_array
|
||||
xarray
|
||||
maple_tree
|
||||
idr
|
||||
circular-buffers
|
||||
rbtree
|
||||
@@ -44,6 +46,8 @@ Library functionality that is used throughout the kernel.
|
||||
this_cpu_ops
|
||||
timekeeping
|
||||
errseq
|
||||
wrappers/atomic_t
|
||||
wrappers/atomic_bitops
|
||||
|
||||
Low level entry and exit
|
||||
========================
|
||||
@@ -67,6 +71,7 @@ Documentation/locking/index.rst for more related documentation.
|
||||
local_ops
|
||||
padata
|
||||
../RCU/index
|
||||
wrappers/memory-barriers.rst
|
||||
|
||||
Low-level hardware management
|
||||
=============================
|
||||
|
||||
@@ -71,7 +71,7 @@ variety of methods:
|
||||
Note that irq domain lookups must happen in contexts that are
|
||||
compatible with a RCU read-side critical section.
|
||||
|
||||
The irq_create_mapping() function must be called *atleast once*
|
||||
The irq_create_mapping() function must be called *at least once*
|
||||
before any call to irq_find_mapping(), lest the descriptor will not
|
||||
be allocated.
|
||||
|
||||
|
||||
217
Documentation/core-api/maple_tree.rst
Normal file
217
Documentation/core-api/maple_tree.rst
Normal file
@@ -0,0 +1,217 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0+
|
||||
|
||||
|
||||
==========
|
||||
Maple Tree
|
||||
==========
|
||||
|
||||
:Author: Liam R. Howlett
|
||||
|
||||
Overview
|
||||
========
|
||||
|
||||
The Maple Tree is a B-Tree data type which is optimized for storing
|
||||
non-overlapping ranges, including ranges of size 1. The tree was designed to
|
||||
be simple to use and does not require a user written search method. It
|
||||
supports iterating over a range of entries and going to the previous or next
|
||||
entry in a cache-efficient manner. The tree can also be put into an RCU-safe
|
||||
mode of operation which allows reading and writing concurrently. Writers must
|
||||
synchronize on a lock, which can be the default spinlock, or the user can set
|
||||
the lock to an external lock of a different type.
|
||||
|
||||
The Maple Tree maintains a small memory footprint and was designed to use
|
||||
modern processor cache efficiently. The majority of the users will be able to
|
||||
use the normal API. An :ref:`maple-tree-advanced-api` exists for more complex
|
||||
scenarios. The most important usage of the Maple Tree is the tracking of the
|
||||
virtual memory areas.
|
||||
|
||||
The Maple Tree can store values between ``0`` and ``ULONG_MAX``. The Maple
|
||||
Tree reserves values with the bottom two bits set to '10' which are below 4096
|
||||
(ie 2, 6, 10 .. 4094) for internal use. If the entries may use reserved
|
||||
entries then the users can convert the entries using xa_mk_value() and convert
|
||||
them back by calling xa_to_value(). If the user needs to use a reserved
|
||||
value, then the user can convert the value when using the
|
||||
:ref:`maple-tree-advanced-api`, but are blocked by the normal API.
|
||||
|
||||
The Maple Tree can also be configured to support searching for a gap of a given
|
||||
size (or larger).
|
||||
|
||||
Pre-allocating of nodes is also supported using the
|
||||
:ref:`maple-tree-advanced-api`. This is useful for users who must guarantee a
|
||||
successful store operation within a given
|
||||
code segment when allocating cannot be done. Allocations of nodes are
|
||||
relatively small at around 256 bytes.
|
||||
|
||||
.. _maple-tree-normal-api:
|
||||
|
||||
Normal API
|
||||
==========
|
||||
|
||||
Start by initialising a maple tree, either with DEFINE_MTREE() for statically
|
||||
allocated maple trees or mt_init() for dynamically allocated ones. A
|
||||
freshly-initialised maple tree contains a ``NULL`` pointer for the range ``0``
|
||||
- ``ULONG_MAX``. There are currently two types of maple trees supported: the
|
||||
allocation tree and the regular tree. The regular tree has a higher branching
|
||||
factor for internal nodes. The allocation tree has a lower branching factor
|
||||
but allows the user to search for a gap of a given size or larger from either
|
||||
``0`` upwards or ``ULONG_MAX`` down. An allocation tree can be used by
|
||||
passing in the ``MT_FLAGS_ALLOC_RANGE`` flag when initialising the tree.
|
||||
|
||||
You can then set entries using mtree_store() or mtree_store_range().
|
||||
mtree_store() will overwrite any entry with the new entry and return 0 on
|
||||
success or an error code otherwise. mtree_store_range() works in the same way
|
||||
but takes a range. mtree_load() is used to retrieve the entry stored at a
|
||||
given index. You can use mtree_erase() to erase an entire range by only
|
||||
knowing one value within that range, or mtree_store() call with an entry of
|
||||
NULL may be used to partially erase a range or many ranges at once.
|
||||
|
||||
If you want to only store a new entry to a range (or index) if that range is
|
||||
currently ``NULL``, you can use mtree_insert_range() or mtree_insert() which
|
||||
return -EEXIST if the range is not empty.
|
||||
|
||||
You can search for an entry from an index upwards by using mt_find().
|
||||
|
||||
You can walk each entry within a range by calling mt_for_each(). You must
|
||||
provide a temporary variable to store a cursor. If you want to walk each
|
||||
element of the tree then ``0`` and ``ULONG_MAX`` may be used as the range. If
|
||||
the caller is going to hold the lock for the duration of the walk then it is
|
||||
worth looking at the mas_for_each() API in the :ref:`maple-tree-advanced-api`
|
||||
section.
|
||||
|
||||
Sometimes it is necessary to ensure the next call to store to a maple tree does
|
||||
not allocate memory, please see :ref:`maple-tree-advanced-api` for this use case.
|
||||
|
||||
Finally, you can remove all entries from a maple tree by calling
|
||||
mtree_destroy(). If the maple tree entries are pointers, you may wish to free
|
||||
the entries first.
|
||||
|
||||
Allocating Nodes
|
||||
----------------
|
||||
|
||||
The allocations are handled by the internal tree code. See
|
||||
:ref:`maple-tree-advanced-alloc` for other options.
|
||||
|
||||
Locking
|
||||
-------
|
||||
|
||||
You do not have to worry about locking. See :ref:`maple-tree-advanced-locks`
|
||||
for other options.
|
||||
|
||||
The Maple Tree uses RCU and an internal spinlock to synchronise access:
|
||||
|
||||
Takes RCU read lock:
|
||||
* mtree_load()
|
||||
* mt_find()
|
||||
* mt_for_each()
|
||||
* mt_next()
|
||||
* mt_prev()
|
||||
|
||||
Takes ma_lock internally:
|
||||
* mtree_store()
|
||||
* mtree_store_range()
|
||||
* mtree_insert()
|
||||
* mtree_insert_range()
|
||||
* mtree_erase()
|
||||
* mtree_destroy()
|
||||
* mt_set_in_rcu()
|
||||
* mt_clear_in_rcu()
|
||||
|
||||
If you want to take advantage of the internal lock to protect the data
|
||||
structures that you are storing in the Maple Tree, you can call mtree_lock()
|
||||
before calling mtree_load(), then take a reference count on the object you
|
||||
have found before calling mtree_unlock(). This will prevent stores from
|
||||
removing the object from the tree between looking up the object and
|
||||
incrementing the refcount. You can also use RCU to avoid dereferencing
|
||||
freed memory, but an explanation of that is beyond the scope of this
|
||||
document.
|
||||
|
||||
.. _maple-tree-advanced-api:
|
||||
|
||||
Advanced API
|
||||
============
|
||||
|
||||
The advanced API offers more flexibility and better performance at the
|
||||
cost of an interface which can be harder to use and has fewer safeguards.
|
||||
You must take care of your own locking while using the advanced API.
|
||||
You can use the ma_lock, RCU or an external lock for protection.
|
||||
You can mix advanced and normal operations on the same array, as long
|
||||
as the locking is compatible. The :ref:`maple-tree-normal-api` is implemented
|
||||
in terms of the advanced API.
|
||||
|
||||
The advanced API is based around the ma_state, this is where the 'mas'
|
||||
prefix originates. The ma_state struct keeps track of tree operations to make
|
||||
life easier for both internal and external tree users.
|
||||
|
||||
Initialising the maple tree is the same as in the :ref:`maple-tree-normal-api`.
|
||||
Please see above.
|
||||
|
||||
The maple state keeps track of the range start and end in mas->index and
|
||||
mas->last, respectively.
|
||||
|
||||
mas_walk() will walk the tree to the location of mas->index and set the
|
||||
mas->index and mas->last according to the range for the entry.
|
||||
|
||||
You can set entries using mas_store(). mas_store() will overwrite any entry
|
||||
with the new entry and return the first existing entry that is overwritten.
|
||||
The range is passed in as members of the maple state: index and last.
|
||||
|
||||
You can use mas_erase() to erase an entire range by setting index and
|
||||
last of the maple state to the desired range to erase. This will erase
|
||||
the first range that is found in that range, set the maple state index
|
||||
and last as the range that was erased and return the entry that existed
|
||||
at that location.
|
||||
|
||||
You can walk each entry within a range by using mas_for_each(). If you want
|
||||
to walk each element of the tree then ``0`` and ``ULONG_MAX`` may be used as
|
||||
the range. If the lock needs to be periodically dropped, see the locking
|
||||
section mas_pause().
|
||||
|
||||
Using a maple state allows mas_next() and mas_prev() to function as if the
|
||||
tree was a linked list. With such a high branching factor the amortized
|
||||
performance penalty is outweighed by cache optimization. mas_next() will
|
||||
return the next entry which occurs after the entry at index. mas_prev()
|
||||
will return the previous entry which occurs before the entry at index.
|
||||
|
||||
mas_find() will find the first entry which exists at or above index on
|
||||
the first call, and the next entry from every subsequent calls.
|
||||
|
||||
mas_find_rev() will find the fist entry which exists at or below the last on
|
||||
the first call, and the previous entry from every subsequent calls.
|
||||
|
||||
If the user needs to yield the lock during an operation, then the maple state
|
||||
must be paused using mas_pause().
|
||||
|
||||
There are a few extra interfaces provided when using an allocation tree.
|
||||
If you wish to search for a gap within a range, then mas_empty_area()
|
||||
or mas_empty_area_rev() can be used. mas_empty_area() searches for a gap
|
||||
starting at the lowest index given up to the maximum of the range.
|
||||
mas_empty_area_rev() searches for a gap starting at the highest index given
|
||||
and continues downward to the lower bound of the range.
|
||||
|
||||
.. _maple-tree-advanced-alloc:
|
||||
|
||||
Advanced Allocating Nodes
|
||||
-------------------------
|
||||
|
||||
Allocations are usually handled internally to the tree, however if allocations
|
||||
need to occur before a write occurs then calling mas_expected_entries() will
|
||||
allocate the worst-case number of needed nodes to insert the provided number of
|
||||
ranges. This also causes the tree to enter mass insertion mode. Once
|
||||
insertions are complete calling mas_destroy() on the maple state will free the
|
||||
unused allocations.
|
||||
|
||||
.. _maple-tree-advanced-locks:
|
||||
|
||||
Advanced Locking
|
||||
----------------
|
||||
|
||||
The maple tree uses a spinlock by default, but external locks can be used for
|
||||
tree updates as well. To use an external lock, the tree must be initialized
|
||||
with the ``MT_FLAGS_LOCK_EXTERN flag``, this is usually done with the
|
||||
MTREE_INIT_EXT() #define, which takes an external lock as an argument.
|
||||
|
||||
Functions and structures
|
||||
========================
|
||||
|
||||
.. kernel-doc:: include/linux/maple_tree.h
|
||||
.. kernel-doc:: lib/maple_tree.c
|
||||
@@ -19,9 +19,6 @@ User Space Memory Access
|
||||
Memory Allocation Controls
|
||||
==========================
|
||||
|
||||
.. kernel-doc:: include/linux/gfp.h
|
||||
:internal:
|
||||
|
||||
.. kernel-doc:: include/linux/gfp_types.h
|
||||
:doc: Page mobility and placement hints
|
||||
|
||||
|
||||
@@ -625,6 +625,16 @@ Examples::
|
||||
%p4cc Y10 little-endian (0x20303159)
|
||||
%p4cc NV12 big-endian (0xb231564e)
|
||||
|
||||
Rust
|
||||
----
|
||||
|
||||
::
|
||||
|
||||
%pA
|
||||
|
||||
Only intended to be used from Rust code to format ``core::fmt::Arguments``.
|
||||
Do *not* use it from C.
|
||||
|
||||
Thanks
|
||||
======
|
||||
|
||||
|
||||
18
Documentation/core-api/wrappers/atomic_bitops.rst
Normal file
18
Documentation/core-api/wrappers/atomic_bitops.rst
Normal file
@@ -0,0 +1,18 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
This is a simple wrapper to bring atomic_bitops.txt into the RST world
|
||||
until such a time as that file can be converted directly.
|
||||
|
||||
=============
|
||||
Atomic bitops
|
||||
=============
|
||||
|
||||
.. raw:: latex
|
||||
|
||||
\footnotesize
|
||||
|
||||
.. include:: ../../atomic_bitops.txt
|
||||
:literal:
|
||||
|
||||
.. raw:: latex
|
||||
|
||||
\normalsize
|
||||
19
Documentation/core-api/wrappers/atomic_t.rst
Normal file
19
Documentation/core-api/wrappers/atomic_t.rst
Normal file
@@ -0,0 +1,19 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
This is a simple wrapper to bring atomic_t.txt into the RST world
|
||||
until such a time as that file can be converted directly.
|
||||
|
||||
============
|
||||
Atomic types
|
||||
============
|
||||
|
||||
.. raw:: latex
|
||||
|
||||
\footnotesize
|
||||
|
||||
.. include:: ../../atomic_t.txt
|
||||
:literal:
|
||||
|
||||
.. raw:: latex
|
||||
|
||||
\normalsize
|
||||
|
||||
18
Documentation/core-api/wrappers/memory-barriers.rst
Normal file
18
Documentation/core-api/wrappers/memory-barriers.rst
Normal file
@@ -0,0 +1,18 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
This is a simple wrapper to bring memory-barriers.txt into the RST world
|
||||
until such a time as that file can be converted directly.
|
||||
|
||||
============================
|
||||
Linux kernel memory barriers
|
||||
============================
|
||||
|
||||
.. raw:: latex
|
||||
|
||||
\footnotesize
|
||||
|
||||
.. include:: ../../memory-barriers.txt
|
||||
:literal:
|
||||
|
||||
.. raw:: latex
|
||||
|
||||
\normalsize
|
||||
@@ -612,6 +612,13 @@ Commit message
|
||||
|
||||
See: https://www.kernel.org/doc/html/latest/process/submitting-patches.html#describe-your-changes
|
||||
|
||||
**BAD_FIXES_TAG**
|
||||
The Fixes: tag is malformed or does not follow the community conventions.
|
||||
This can occur if the tag have been split into multiple lines (e.g., when
|
||||
pasted in an email program with word wrapping enabled).
|
||||
|
||||
See: https://www.kernel.org/doc/html/latest/process/submitting-patches.html#describe-your-changes
|
||||
|
||||
|
||||
Comparison style
|
||||
----------------
|
||||
|
||||
@@ -24,6 +24,7 @@ Documentation/dev-tools/testing-overview.rst
|
||||
kcov
|
||||
gcov
|
||||
kasan
|
||||
kmsan
|
||||
ubsan
|
||||
kmemleak
|
||||
kcsan
|
||||
|
||||
@@ -111,9 +111,17 @@ parameter can be used to control panic and reporting behaviour:
|
||||
report or also panic the kernel (default: ``report``). The panic happens even
|
||||
if ``kasan_multi_shot`` is enabled.
|
||||
|
||||
Hardware Tag-Based KASAN mode (see the section about various modes below) is
|
||||
intended for use in production as a security mitigation. Therefore, it supports
|
||||
additional boot parameters that allow disabling KASAN or controlling features:
|
||||
Software and Hardware Tag-Based KASAN modes (see the section about various
|
||||
modes below) support altering stack trace collection behavior:
|
||||
|
||||
- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack
|
||||
traces collection (default: ``on``).
|
||||
- ``kasan.stack_ring_size=<number of entries>`` specifies the number of entries
|
||||
in the stack ring (default: ``32768``).
|
||||
|
||||
Hardware Tag-Based KASAN mode is intended for use in production as a security
|
||||
mitigation. Therefore, it supports additional boot parameters that allow
|
||||
disabling KASAN altogether or controlling its features:
|
||||
|
||||
- ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``).
|
||||
|
||||
@@ -132,9 +140,6 @@ additional boot parameters that allow disabling KASAN or controlling features:
|
||||
- ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc
|
||||
allocations (default: ``on``).
|
||||
|
||||
- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack
|
||||
traces collection (default: ``on``).
|
||||
|
||||
Error reports
|
||||
~~~~~~~~~~~~~
|
||||
|
||||
|
||||
427
Documentation/dev-tools/kmsan.rst
Normal file
427
Documentation/dev-tools/kmsan.rst
Normal file
@@ -0,0 +1,427 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
.. Copyright (C) 2022, Google LLC.
|
||||
|
||||
===================================
|
||||
The Kernel Memory Sanitizer (KMSAN)
|
||||
===================================
|
||||
|
||||
KMSAN is a dynamic error detector aimed at finding uses of uninitialized
|
||||
values. It is based on compiler instrumentation, and is quite similar to the
|
||||
userspace `MemorySanitizer tool`_.
|
||||
|
||||
An important note is that KMSAN is not intended for production use, because it
|
||||
drastically increases kernel memory footprint and slows the whole system down.
|
||||
|
||||
Usage
|
||||
=====
|
||||
|
||||
Building the kernel
|
||||
-------------------
|
||||
|
||||
In order to build a kernel with KMSAN you will need a fresh Clang (14.0.6+).
|
||||
Please refer to `LLVM documentation`_ for the instructions on how to build Clang.
|
||||
|
||||
Now configure and build the kernel with CONFIG_KMSAN enabled.
|
||||
|
||||
Example report
|
||||
--------------
|
||||
|
||||
Here is an example of a KMSAN report::
|
||||
|
||||
=====================================================
|
||||
BUG: KMSAN: uninit-value in test_uninit_kmsan_check_memory+0x1be/0x380 [kmsan_test]
|
||||
test_uninit_kmsan_check_memory+0x1be/0x380 mm/kmsan/kmsan_test.c:273
|
||||
kunit_run_case_internal lib/kunit/test.c:333
|
||||
kunit_try_run_case+0x206/0x420 lib/kunit/test.c:374
|
||||
kunit_generic_run_threadfn_adapter+0x6d/0xc0 lib/kunit/try-catch.c:28
|
||||
kthread+0x721/0x850 kernel/kthread.c:327
|
||||
ret_from_fork+0x1f/0x30 ??:?
|
||||
|
||||
Uninit was stored to memory at:
|
||||
do_uninit_local_array+0xfa/0x110 mm/kmsan/kmsan_test.c:260
|
||||
test_uninit_kmsan_check_memory+0x1a2/0x380 mm/kmsan/kmsan_test.c:271
|
||||
kunit_run_case_internal lib/kunit/test.c:333
|
||||
kunit_try_run_case+0x206/0x420 lib/kunit/test.c:374
|
||||
kunit_generic_run_threadfn_adapter+0x6d/0xc0 lib/kunit/try-catch.c:28
|
||||
kthread+0x721/0x850 kernel/kthread.c:327
|
||||
ret_from_fork+0x1f/0x30 ??:?
|
||||
|
||||
Local variable uninit created at:
|
||||
do_uninit_local_array+0x4a/0x110 mm/kmsan/kmsan_test.c:256
|
||||
test_uninit_kmsan_check_memory+0x1a2/0x380 mm/kmsan/kmsan_test.c:271
|
||||
|
||||
Bytes 4-7 of 8 are uninitialized
|
||||
Memory access of size 8 starts at ffff888083fe3da0
|
||||
|
||||
CPU: 0 PID: 6731 Comm: kunit_try_catch Tainted: G B E 5.16.0-rc3+ #104
|
||||
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
|
||||
=====================================================
|
||||
|
||||
The report says that the local variable ``uninit`` was created uninitialized in
|
||||
``do_uninit_local_array()``. The third stack trace corresponds to the place
|
||||
where this variable was created.
|
||||
|
||||
The first stack trace shows where the uninit value was used (in
|
||||
``test_uninit_kmsan_check_memory()``). The tool shows the bytes which were left
|
||||
uninitialized in the local variable, as well as the stack where the value was
|
||||
copied to another memory location before use.
|
||||
|
||||
A use of uninitialized value ``v`` is reported by KMSAN in the following cases:
|
||||
- in a condition, e.g. ``if (v) { ... }``;
|
||||
- in an indexing or pointer dereferencing, e.g. ``array[v]`` or ``*v``;
|
||||
- when it is copied to userspace or hardware, e.g. ``copy_to_user(..., &v, ...)``;
|
||||
- when it is passed as an argument to a function, and
|
||||
``CONFIG_KMSAN_CHECK_PARAM_RETVAL`` is enabled (see below).
|
||||
|
||||
The mentioned cases (apart from copying data to userspace or hardware, which is
|
||||
a security issue) are considered undefined behavior from the C11 Standard point
|
||||
of view.
|
||||
|
||||
Disabling the instrumentation
|
||||
-----------------------------
|
||||
|
||||
A function can be marked with ``__no_kmsan_checks``. Doing so makes KMSAN
|
||||
ignore uninitialized values in that function and mark its output as initialized.
|
||||
As a result, the user will not get KMSAN reports related to that function.
|
||||
|
||||
Another function attribute supported by KMSAN is ``__no_sanitize_memory``.
|
||||
Applying this attribute to a function will result in KMSAN not instrumenting
|
||||
it, which can be helpful if we do not want the compiler to interfere with some
|
||||
low-level code (e.g. that marked with ``noinstr`` which implicitly adds
|
||||
``__no_sanitize_memory``).
|
||||
|
||||
This however comes at a cost: stack allocations from such functions will have
|
||||
incorrect shadow/origin values, likely leading to false positives. Functions
|
||||
called from non-instrumented code may also receive incorrect metadata for their
|
||||
parameters.
|
||||
|
||||
As a rule of thumb, avoid using ``__no_sanitize_memory`` explicitly.
|
||||
|
||||
It is also possible to disable KMSAN for a single file (e.g. main.o)::
|
||||
|
||||
KMSAN_SANITIZE_main.o := n
|
||||
|
||||
or for the whole directory::
|
||||
|
||||
KMSAN_SANITIZE := n
|
||||
|
||||
in the Makefile. Think of this as applying ``__no_sanitize_memory`` to every
|
||||
function in the file or directory. Most users won't need KMSAN_SANITIZE, unless
|
||||
their code gets broken by KMSAN (e.g. runs at early boot time).
|
||||
|
||||
Support
|
||||
=======
|
||||
|
||||
In order for KMSAN to work the kernel must be built with Clang, which so far is
|
||||
the only compiler that has KMSAN support. The kernel instrumentation pass is
|
||||
based on the userspace `MemorySanitizer tool`_.
|
||||
|
||||
The runtime library only supports x86_64 at the moment.
|
||||
|
||||
How KMSAN works
|
||||
===============
|
||||
|
||||
KMSAN shadow memory
|
||||
-------------------
|
||||
|
||||
KMSAN associates a metadata byte (also called shadow byte) with every byte of
|
||||
kernel memory. A bit in the shadow byte is set iff the corresponding bit of the
|
||||
kernel memory byte is uninitialized. Marking the memory uninitialized (i.e.
|
||||
setting its shadow bytes to ``0xff``) is called poisoning, marking it
|
||||
initialized (setting the shadow bytes to ``0x00``) is called unpoisoning.
|
||||
|
||||
When a new variable is allocated on the stack, it is poisoned by default by
|
||||
instrumentation code inserted by the compiler (unless it is a stack variable
|
||||
that is immediately initialized). Any new heap allocation done without
|
||||
``__GFP_ZERO`` is also poisoned.
|
||||
|
||||
Compiler instrumentation also tracks the shadow values as they are used along
|
||||
the code. When needed, instrumentation code invokes the runtime library in
|
||||
``mm/kmsan/`` to persist shadow values.
|
||||
|
||||
The shadow value of a basic or compound type is an array of bytes of the same
|
||||
length. When a constant value is written into memory, that memory is unpoisoned.
|
||||
When a value is read from memory, its shadow memory is also obtained and
|
||||
propagated into all the operations which use that value. For every instruction
|
||||
that takes one or more values the compiler generates code that calculates the
|
||||
shadow of the result depending on those values and their shadows.
|
||||
|
||||
Example::
|
||||
|
||||
int a = 0xff; // i.e. 0x000000ff
|
||||
int b;
|
||||
int c = a | b;
|
||||
|
||||
In this case the shadow of ``a`` is ``0``, shadow of ``b`` is ``0xffffffff``,
|
||||
shadow of ``c`` is ``0xffffff00``. This means that the upper three bytes of
|
||||
``c`` are uninitialized, while the lower byte is initialized.
|
||||
|
||||
Origin tracking
|
||||
---------------
|
||||
|
||||
Every four bytes of kernel memory also have a so-called origin mapped to them.
|
||||
This origin describes the point in program execution at which the uninitialized
|
||||
value was created. Every origin is associated with either the full allocation
|
||||
stack (for heap-allocated memory), or the function containing the uninitialized
|
||||
variable (for locals).
|
||||
|
||||
When an uninitialized variable is allocated on stack or heap, a new origin
|
||||
value is created, and that variable's origin is filled with that value. When a
|
||||
value is read from memory, its origin is also read and kept together with the
|
||||
shadow. For every instruction that takes one or more values, the origin of the
|
||||
result is one of the origins corresponding to any of the uninitialized inputs.
|
||||
If a poisoned value is written into memory, its origin is written to the
|
||||
corresponding storage as well.
|
||||
|
||||
Example 1::
|
||||
|
||||
int a = 42;
|
||||
int b;
|
||||
int c = a + b;
|
||||
|
||||
In this case the origin of ``b`` is generated upon function entry, and is
|
||||
stored to the origin of ``c`` right before the addition result is written into
|
||||
memory.
|
||||
|
||||
Several variables may share the same origin address, if they are stored in the
|
||||
same four-byte chunk. In this case every write to either variable updates the
|
||||
origin for all of them. We have to sacrifice precision in this case, because
|
||||
storing origins for individual bits (and even bytes) would be too costly.
|
||||
|
||||
Example 2::
|
||||
|
||||
int combine(short a, short b) {
|
||||
union ret_t {
|
||||
int i;
|
||||
short s[2];
|
||||
} ret;
|
||||
ret.s[0] = a;
|
||||
ret.s[1] = b;
|
||||
return ret.i;
|
||||
}
|
||||
|
||||
If ``a`` is initialized and ``b`` is not, the shadow of the result would be
|
||||
0xffff0000, and the origin of the result would be the origin of ``b``.
|
||||
``ret.s[0]`` would have the same origin, but it will never be used, because
|
||||
that variable is initialized.
|
||||
|
||||
If both function arguments are uninitialized, only the origin of the second
|
||||
argument is preserved.
|
||||
|
||||
Origin chaining
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
To ease debugging, KMSAN creates a new origin for every store of an
|
||||
uninitialized value to memory. The new origin references both its creation stack
|
||||
and the previous origin the value had. This may cause increased memory
|
||||
consumption, so we limit the length of origin chains in the runtime.
|
||||
|
||||
Clang instrumentation API
|
||||
-------------------------
|
||||
|
||||
Clang instrumentation pass inserts calls to functions defined in
|
||||
``mm/kmsan/nstrumentation.c`` into the kernel code.
|
||||
|
||||
Shadow manipulation
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
For every memory access the compiler emits a call to a function that returns a
|
||||
pair of pointers to the shadow and origin addresses of the given memory::
|
||||
|
||||
typedef struct {
|
||||
void *shadow, *origin;
|
||||
} shadow_origin_ptr_t
|
||||
|
||||
shadow_origin_ptr_t __msan_metadata_ptr_for_load_{1,2,4,8}(void *addr)
|
||||
shadow_origin_ptr_t __msan_metadata_ptr_for_store_{1,2,4,8}(void *addr)
|
||||
shadow_origin_ptr_t __msan_metadata_ptr_for_load_n(void *addr, uintptr_t size)
|
||||
shadow_origin_ptr_t __msan_metadata_ptr_for_store_n(void *addr, uintptr_t size)
|
||||
|
||||
The function name depends on the memory access size.
|
||||
|
||||
The compiler makes sure that for every loaded value its shadow and origin
|
||||
values are read from memory. When a value is stored to memory, its shadow and
|
||||
origin are also stored using the metadata pointers.
|
||||
|
||||
Handling locals
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
A special function is used to create a new origin value for a local variable and
|
||||
set the origin of that variable to that value::
|
||||
|
||||
void __msan_poison_alloca(void *addr, uintptr_t size, char *descr)
|
||||
|
||||
Access to per-task data
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
At the beginning of every instrumented function KMSAN inserts a call to
|
||||
``__msan_get_context_state()``::
|
||||
|
||||
kmsan_context_state *__msan_get_context_state(void)
|
||||
|
||||
``kmsan_context_state`` is declared in ``include/linux/kmsan.h``::
|
||||
|
||||
struct kmsan_context_state {
|
||||
char param_tls[KMSAN_PARAM_SIZE];
|
||||
char retval_tls[KMSAN_RETVAL_SIZE];
|
||||
char va_arg_tls[KMSAN_PARAM_SIZE];
|
||||
char va_arg_origin_tls[KMSAN_PARAM_SIZE];
|
||||
u64 va_arg_overflow_size_tls;
|
||||
char param_origin_tls[KMSAN_PARAM_SIZE];
|
||||
depot_stack_handle_t retval_origin_tls;
|
||||
};
|
||||
|
||||
This structure is used by KMSAN to pass parameter shadows and origins between
|
||||
instrumented functions (unless the parameters are checked immediately by
|
||||
``CONFIG_KMSAN_CHECK_PARAM_RETVAL``).
|
||||
|
||||
Passing uninitialized values to functions
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Clang's MemorySanitizer instrumentation has an option,
|
||||
``-fsanitize-memory-param-retval``, which makes the compiler check function
|
||||
parameters passed by value, as well as function return values.
|
||||
|
||||
The option is controlled by ``CONFIG_KMSAN_CHECK_PARAM_RETVAL``, which is
|
||||
enabled by default to let KMSAN report uninitialized values earlier.
|
||||
Please refer to the `LKML discussion`_ for more details.
|
||||
|
||||
Because of the way the checks are implemented in LLVM (they are only applied to
|
||||
parameters marked as ``noundef``), not all parameters are guaranteed to be
|
||||
checked, so we cannot give up the metadata storage in ``kmsan_context_state``.
|
||||
|
||||
String functions
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
The compiler replaces calls to ``memcpy()``/``memmove()``/``memset()`` with the
|
||||
following functions. These functions are also called when data structures are
|
||||
initialized or copied, making sure shadow and origin values are copied alongside
|
||||
with the data::
|
||||
|
||||
void *__msan_memcpy(void *dst, void *src, uintptr_t n)
|
||||
void *__msan_memmove(void *dst, void *src, uintptr_t n)
|
||||
void *__msan_memset(void *dst, int c, uintptr_t n)
|
||||
|
||||
Error reporting
|
||||
~~~~~~~~~~~~~~~
|
||||
|
||||
For each use of a value the compiler emits a shadow check that calls
|
||||
``__msan_warning()`` in the case that value is poisoned::
|
||||
|
||||
void __msan_warning(u32 origin)
|
||||
|
||||
``__msan_warning()`` causes KMSAN runtime to print an error report.
|
||||
|
||||
Inline assembly instrumentation
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
KMSAN instruments every inline assembly output with a call to::
|
||||
|
||||
void __msan_instrument_asm_store(void *addr, uintptr_t size)
|
||||
|
||||
, which unpoisons the memory region.
|
||||
|
||||
This approach may mask certain errors, but it also helps to avoid a lot of
|
||||
false positives in bitwise operations, atomics etc.
|
||||
|
||||
Sometimes the pointers passed into inline assembly do not point to valid memory.
|
||||
In such cases they are ignored at runtime.
|
||||
|
||||
|
||||
Runtime library
|
||||
---------------
|
||||
|
||||
The code is located in ``mm/kmsan/``.
|
||||
|
||||
Per-task KMSAN state
|
||||
~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Every task_struct has an associated KMSAN task state that holds the KMSAN
|
||||
context (see above) and a per-task flag disallowing KMSAN reports::
|
||||
|
||||
struct kmsan_context {
|
||||
...
|
||||
bool allow_reporting;
|
||||
struct kmsan_context_state cstate;
|
||||
...
|
||||
}
|
||||
|
||||
struct task_struct {
|
||||
...
|
||||
struct kmsan_context kmsan;
|
||||
...
|
||||
}
|
||||
|
||||
KMSAN contexts
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
When running in a kernel task context, KMSAN uses ``current->kmsan.cstate`` to
|
||||
hold the metadata for function parameters and return values.
|
||||
|
||||
But in the case the kernel is running in the interrupt, softirq or NMI context,
|
||||
where ``current`` is unavailable, KMSAN switches to per-cpu interrupt state::
|
||||
|
||||
DEFINE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx);
|
||||
|
||||
Metadata allocation
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
There are several places in the kernel for which the metadata is stored.
|
||||
|
||||
1. Each ``struct page`` instance contains two pointers to its shadow and
|
||||
origin pages::
|
||||
|
||||
struct page {
|
||||
...
|
||||
struct page *shadow, *origin;
|
||||
...
|
||||
};
|
||||
|
||||
At boot-time, the kernel allocates shadow and origin pages for every available
|
||||
kernel page. This is done quite late, when the kernel address space is already
|
||||
fragmented, so normal data pages may arbitrarily interleave with the metadata
|
||||
pages.
|
||||
|
||||
This means that in general for two contiguous memory pages their shadow/origin
|
||||
pages may not be contiguous. Consequently, if a memory access crosses the
|
||||
boundary of a memory block, accesses to shadow/origin memory may potentially
|
||||
corrupt other pages or read incorrect values from them.
|
||||
|
||||
In practice, contiguous memory pages returned by the same ``alloc_pages()``
|
||||
call will have contiguous metadata, whereas if these pages belong to two
|
||||
different allocations their metadata pages can be fragmented.
|
||||
|
||||
For the kernel data (``.data``, ``.bss`` etc.) and percpu memory regions
|
||||
there also are no guarantees on metadata contiguity.
|
||||
|
||||
In the case ``__msan_metadata_ptr_for_XXX_YYY()`` hits the border between two
|
||||
pages with non-contiguous metadata, it returns pointers to fake shadow/origin regions::
|
||||
|
||||
char dummy_load_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
|
||||
char dummy_store_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE)));
|
||||
|
||||
``dummy_load_page`` is zero-initialized, so reads from it always yield zeroes.
|
||||
All stores to ``dummy_store_page`` are ignored.
|
||||
|
||||
2. For vmalloc memory and modules, there is a direct mapping between the memory
|
||||
range, its shadow and origin. KMSAN reduces the vmalloc area by 3/4, making only
|
||||
the first quarter available to ``vmalloc()``. The second quarter of the vmalloc
|
||||
area contains shadow memory for the first quarter, the third one holds the
|
||||
origins. A small part of the fourth quarter contains shadow and origins for the
|
||||
kernel modules. Please refer to ``arch/x86/include/asm/pgtable_64_types.h`` for
|
||||
more details.
|
||||
|
||||
When an array of pages is mapped into a contiguous virtual memory space, their
|
||||
shadow and origin pages are similarly mapped into contiguous regions.
|
||||
|
||||
References
|
||||
==========
|
||||
|
||||
E. Stepanov, K. Serebryany. `MemorySanitizer: fast detector of uninitialized
|
||||
memory use in C++
|
||||
<https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/43308.pdf>`_.
|
||||
In Proceedings of CGO 2015.
|
||||
|
||||
.. _MemorySanitizer tool: https://clang.llvm.org/docs/MemorySanitizer.html
|
||||
.. _LLVM documentation: https://llvm.org/docs/GettingStarted.html
|
||||
.. _LKML discussion: https://lore.kernel.org/all/20220614144853.3693273-1-glider@google.com/
|
||||
@@ -320,7 +320,7 @@ A bare bones test module might look like this:
|
||||
|
||||
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
|
||||
|
||||
#include "../tools/testing/selftests/kselftest/module.h"
|
||||
#include "../tools/testing/selftests/kselftest_module.h"
|
||||
|
||||
KSTM_MODULE_GLOBALS();
|
||||
|
||||
|
||||
@@ -6,8 +6,8 @@ KUnit Architecture
|
||||
|
||||
The KUnit architecture can be divided into two parts:
|
||||
|
||||
- Kernel testing library
|
||||
- kunit_tool (Command line test harness)
|
||||
- `In-Kernel Testing Framework`_
|
||||
- `kunit_tool (Command Line Test Harness)`_
|
||||
|
||||
In-Kernel Testing Framework
|
||||
===========================
|
||||
|
||||
@@ -31,13 +31,16 @@ For the most part, the KUnit core framework (what we use to write the tests)
|
||||
can compile to any architecture. It compiles like just another part of the
|
||||
kernel and runs when the kernel boots, or when built as a module, when the
|
||||
module is loaded. However, there is infrastructure, like the KUnit Wrapper
|
||||
(``tools/testing/kunit/kunit.py``) that does not support other architectures.
|
||||
(``tools/testing/kunit/kunit.py``) that might not support some architectures
|
||||
(see :ref:`kunit-on-qemu`).
|
||||
|
||||
In short, yes, you can run KUnit on other architectures, but it might require
|
||||
more work than using KUnit on UML.
|
||||
|
||||
For more information, see :ref:`kunit-on-non-uml`.
|
||||
|
||||
.. _kinds-of-tests:
|
||||
|
||||
What is the difference between a unit test and other kinds of tests?
|
||||
====================================================================
|
||||
Most existing tests for the Linux kernel would be categorized as an integration
|
||||
@@ -95,8 +98,7 @@ things to try.
|
||||
seeing. When tests are built-in, they will execute when the kernel boots, and
|
||||
modules will automatically execute associated tests when loaded. Test results
|
||||
can be collected from ``/sys/kernel/debug/kunit/<test suite>/results``, and
|
||||
can be parsed with ``kunit.py parse``. For more details, see "KUnit on
|
||||
non-UML architectures" in Documentation/dev-tools/kunit/usage.rst.
|
||||
can be parsed with ``kunit.py parse``. For more details, see :ref:`kunit-on-qemu`.
|
||||
|
||||
If none of the above tricks help, you are always welcome to email any issues to
|
||||
kunit-dev@googlegroups.com.
|
||||
|
||||
@@ -13,7 +13,6 @@ KUnit - Linux Kernel Unit Testing
|
||||
run_wrapper
|
||||
run_manual
|
||||
usage
|
||||
kunit-tool
|
||||
api/index
|
||||
style
|
||||
faq
|
||||
@@ -29,10 +28,10 @@ KUnit (Kernel unit testing framework) provides a common framework for
|
||||
unit tests within the Linux kernel. Using KUnit, you can define groups
|
||||
of test cases called test suites. The tests either run on kernel boot
|
||||
if built-in, or load as a module. KUnit automatically flags and reports
|
||||
failed test cases in the kernel log. The test results appear in `TAP
|
||||
(Test Anything Protocol) format <https://testanything.org/>`_. It is inspired by
|
||||
JUnit, Python’s unittest.mock, and GoogleTest/GoogleMock (C++ unit testing
|
||||
framework).
|
||||
failed test cases in the kernel log. The test results appear in
|
||||
:doc:`KTAP (Kernel - Test Anything Protocol) format</dev-tools/ktap>`.
|
||||
It is inspired by JUnit, Python’s unittest.mock, and GoogleTest/GoogleMock
|
||||
(C++ unit testing framework).
|
||||
|
||||
KUnit tests are part of the kernel, written in the C (programming)
|
||||
language, and test parts of the Kernel implementation (example: a C
|
||||
@@ -46,8 +45,9 @@ internal system functionality. KUnit runs in kernel space and is not
|
||||
restricted to things exposed to user-space.
|
||||
|
||||
In addition, KUnit has kunit_tool, a script (``tools/testing/kunit/kunit.py``)
|
||||
that configures the Linux kernel, runs KUnit tests under QEMU or UML (`User Mode
|
||||
Linux <http://user-mode-linux.sourceforge.net/>`_), parses the test results and
|
||||
that configures the Linux kernel, runs KUnit tests under QEMU or UML
|
||||
(:doc:`User Mode Linux </virt/uml/user_mode_linux_howto_v2>`),
|
||||
parses the test results and
|
||||
displays them in a user friendly manner.
|
||||
|
||||
Features
|
||||
@@ -95,6 +95,8 @@ Unit Testing Advantages
|
||||
- Improves code quality.
|
||||
- Encourages writing testable code.
|
||||
|
||||
Read also :ref:`kinds-of-tests`.
|
||||
|
||||
How do I use it?
|
||||
================
|
||||
|
||||
@@ -107,7 +109,5 @@ How do I use it?
|
||||
examples.
|
||||
* Documentation/dev-tools/kunit/api/index.rst - KUnit APIs
|
||||
used for testing.
|
||||
* Documentation/dev-tools/kunit/kunit-tool.rst - kunit_tool helper
|
||||
script.
|
||||
* Documentation/dev-tools/kunit/faq.rst - KUnit common questions and
|
||||
answers.
|
||||
|
||||
@@ -1,232 +0,0 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=================
|
||||
kunit_tool How-To
|
||||
=================
|
||||
|
||||
What is kunit_tool?
|
||||
===================
|
||||
|
||||
kunit_tool is a script (``tools/testing/kunit/kunit.py``) that aids in building
|
||||
the Linux kernel as UML (`User Mode Linux
|
||||
<http://user-mode-linux.sourceforge.net/>`_), running KUnit tests, parsing
|
||||
the test results and displaying them in a user friendly manner.
|
||||
|
||||
kunit_tool addresses the problem of being able to run tests without needing a
|
||||
virtual machine or actual hardware with User Mode Linux. User Mode Linux is a
|
||||
Linux architecture, like ARM or x86; however, unlike other architectures it
|
||||
compiles the kernel as a standalone Linux executable that can be run like any
|
||||
other program directly inside of a host operating system. To be clear, it does
|
||||
not require any virtualization support: it is just a regular program.
|
||||
|
||||
What is a .kunitconfig?
|
||||
=======================
|
||||
|
||||
It's just a defconfig that kunit_tool looks for in the build directory
|
||||
(``.kunit`` by default). kunit_tool uses it to generate a .config as you might
|
||||
expect. In addition, it verifies that the generated .config contains the CONFIG
|
||||
options in the .kunitconfig; the reason it does this is so that it is easy to
|
||||
be sure that a CONFIG that enables a test actually ends up in the .config.
|
||||
|
||||
It's also possible to pass a separate .kunitconfig fragment to kunit_tool,
|
||||
which is useful if you have several different groups of tests you wish
|
||||
to run independently, or if you want to use pre-defined test configs for
|
||||
certain subsystems.
|
||||
|
||||
Getting Started with kunit_tool
|
||||
===============================
|
||||
|
||||
If a kunitconfig is present at the root directory, all you have to do is:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./tools/testing/kunit/kunit.py run
|
||||
|
||||
However, you most likely want to use it with the following options:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./tools/testing/kunit/kunit.py run --timeout=30 --jobs=`nproc --all`
|
||||
|
||||
- ``--timeout`` sets a maximum amount of time to allow tests to run.
|
||||
- ``--jobs`` sets the number of threads to use to build the kernel.
|
||||
|
||||
.. note::
|
||||
This command will work even without a .kunitconfig file: if no
|
||||
.kunitconfig is present, a default one will be used instead.
|
||||
|
||||
If you wish to use a different .kunitconfig file (such as one provided for
|
||||
testing a particular subsystem), you can pass it as an option.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./tools/testing/kunit/kunit.py run --kunitconfig=fs/ext4/.kunitconfig
|
||||
|
||||
For a list of all the flags supported by kunit_tool, you can run:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./tools/testing/kunit/kunit.py run --help
|
||||
|
||||
Configuring, Building, and Running Tests
|
||||
========================================
|
||||
|
||||
It's also possible to run just parts of the KUnit build process independently,
|
||||
which is useful if you want to make manual changes to part of the process.
|
||||
|
||||
A .config can be generated from a .kunitconfig by using the ``config`` argument
|
||||
when running kunit_tool:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./tools/testing/kunit/kunit.py config
|
||||
|
||||
Similarly, if you just want to build a KUnit kernel from the current .config,
|
||||
you can use the ``build`` argument:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./tools/testing/kunit/kunit.py build
|
||||
|
||||
And, if you already have a built UML kernel with built-in KUnit tests, you can
|
||||
run the kernel and display the test results with the ``exec`` argument:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./tools/testing/kunit/kunit.py exec
|
||||
|
||||
The ``run`` command which is discussed above is equivalent to running all three
|
||||
of these in sequence.
|
||||
|
||||
All of these commands accept a number of optional command-line arguments. The
|
||||
``--help`` flag will give a complete list of these, or keep reading this page
|
||||
for a guide to some of the more useful ones.
|
||||
|
||||
Parsing Test Results
|
||||
====================
|
||||
|
||||
KUnit tests output their results in TAP (Test Anything Protocol) format.
|
||||
kunit_tool will, when running tests, parse this output and print a summary
|
||||
which is much more pleasant to read. If you wish to look at the raw test
|
||||
results in TAP format, you can pass the ``--raw_output`` argument.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./tools/testing/kunit/kunit.py run --raw_output
|
||||
|
||||
The raw output from test runs may contain other, non-KUnit kernel log
|
||||
lines. You can see just KUnit output with ``--raw_output=kunit``:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./tools/testing/kunit/kunit.py run --raw_output=kunit
|
||||
|
||||
If you have KUnit results in their raw TAP format, you can parse them and print
|
||||
the human-readable summary with the ``parse`` command for kunit_tool. This
|
||||
accepts a filename for an argument, or will read from standard input.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
# Reading from a file
|
||||
./tools/testing/kunit/kunit.py parse /var/log/dmesg
|
||||
# Reading from stdin
|
||||
dmesg | ./tools/testing/kunit/kunit.py parse
|
||||
|
||||
This is very useful if you wish to run tests in a configuration not supported
|
||||
by kunit_tool (such as on real hardware, or an unsupported architecture).
|
||||
|
||||
Filtering Tests
|
||||
===============
|
||||
|
||||
It's possible to run only a subset of the tests built into a kernel by passing
|
||||
a filter to the ``exec`` or ``run`` commands. For example, if you only wanted
|
||||
to run KUnit resource tests, you could use:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./tools/testing/kunit/kunit.py run 'kunit-resource*'
|
||||
|
||||
This uses the standard glob format for wildcards.
|
||||
|
||||
Running Tests on QEMU
|
||||
=====================
|
||||
|
||||
kunit_tool supports running tests on QEMU as well as via UML (as mentioned
|
||||
elsewhere). The default way of running tests on QEMU requires two flags:
|
||||
|
||||
``--arch``
|
||||
Selects a collection of configs (Kconfig as well as QEMU configs
|
||||
options, etc) that allow KUnit tests to be run on the specified
|
||||
architecture in a minimal way; this is usually not much slower than
|
||||
using UML. The architecture argument is the same as the name of the
|
||||
option passed to the ``ARCH`` variable used by Kbuild. Not all
|
||||
architectures are currently supported by this flag, but can be handled
|
||||
by the ``--qemu_config`` discussed later. If ``um`` is passed (or this
|
||||
this flag is ignored) the tests will run via UML. Non-UML architectures,
|
||||
e.g. i386, x86_64, arm, um, etc. Non-UML run on QEMU.
|
||||
|
||||
``--cross_compile``
|
||||
Specifies the use of a toolchain by Kbuild. The argument passed here is
|
||||
the same passed to the ``CROSS_COMPILE`` variable used by Kbuild. As a
|
||||
reminder this will be the prefix for the toolchain binaries such as gcc
|
||||
for example ``sparc64-linux-gnu-`` if you have the sparc toolchain
|
||||
installed on your system, or
|
||||
``$HOME/toolchains/microblaze/gcc-9.2.0-nolibc/microblaze-linux/bin/microblaze-linux-``
|
||||
if you have downloaded the microblaze toolchain from the 0-day website
|
||||
to a directory in your home directory called ``toolchains``.
|
||||
|
||||
In many cases it is likely that you may want to run an architecture which is
|
||||
not supported by the ``--arch`` flag, or you may want to just run KUnit tests
|
||||
on QEMU using a non-default configuration. For this use case, you can write
|
||||
your own QemuConfig. These QemuConfigs are written in Python. They must have an
|
||||
import line ``from ..qemu_config import QemuArchParams`` at the top of the file
|
||||
and the file must contain a variable called ``QEMU_ARCH`` that has an instance
|
||||
of ``QemuArchParams`` assigned to it. An example can be seen in
|
||||
``tools/testing/kunit/qemu_configs/x86_64.py``.
|
||||
|
||||
Once you have a QemuConfig you can pass it into kunit_tool using the
|
||||
``--qemu_config`` flag; when used this flag replaces the ``--arch`` flag. If we
|
||||
were to do this with the ``x86_64.py`` example from above, the invocation would
|
||||
look something like this:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./tools/testing/kunit/kunit.py run \
|
||||
--timeout=60 \
|
||||
--jobs=12 \
|
||||
--qemu_config=./tools/testing/kunit/qemu_configs/x86_64.py
|
||||
|
||||
Other Useful Options
|
||||
====================
|
||||
|
||||
kunit_tool has a number of other command-line arguments which can be useful
|
||||
when adapting it to fit your environment or needs.
|
||||
|
||||
Some of the more useful ones are:
|
||||
|
||||
``--help``
|
||||
Lists all of the available options. Note that different commands
|
||||
(``config``, ``build``, ``run``, etc) will have different supported
|
||||
options. Place ``--help`` before the command to list common options,
|
||||
and after the command for options specific to that command.
|
||||
|
||||
``--build_dir``
|
||||
Specifies the build directory that kunit_tool will use. This is where
|
||||
the .kunitconfig file is located, as well as where the .config and
|
||||
compiled kernel will be placed. Defaults to ``.kunit``.
|
||||
|
||||
``--make_options``
|
||||
Specifies additional options to pass to ``make`` when compiling a
|
||||
kernel (with the ``build`` or ``run`` commands). For example, to enable
|
||||
compiler warnings, you can pass ``--make_options W=1``.
|
||||
|
||||
``--alltests``
|
||||
Builds a UML kernel with all config options enabled using ``make
|
||||
allyesconfig``. This allows you to run as many tests as is possible,
|
||||
but is very slow and prone to breakage as new options are added or
|
||||
modified. In most cases, enabling all tests which have satisfied
|
||||
dependencies by adding ``CONFIG_KUNIT_ALL_TESTS=1`` to your
|
||||
.kunitconfig is preferable.
|
||||
|
||||
There are several other options (and new ones are often added), so do check
|
||||
``--help`` if you're looking for something not mentioned here.
|
||||
@@ -1,8 +1,8 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=========================
|
||||
Run Tests with kunit_tool
|
||||
=========================
|
||||
=============================
|
||||
Running tests with kunit_tool
|
||||
=============================
|
||||
|
||||
We can either run KUnit tests using kunit_tool or can run tests
|
||||
manually, and then use kunit_tool to parse the results. To run tests
|
||||
@@ -22,7 +22,7 @@ We should see the following:
|
||||
|
||||
.. code-block::
|
||||
|
||||
Generating .config...
|
||||
Configuring KUnit Kernel ...
|
||||
Building KUnit kernel...
|
||||
Starting KUnit kernel...
|
||||
|
||||
@@ -30,7 +30,7 @@ We may want to use the following options:
|
||||
|
||||
.. code-block::
|
||||
|
||||
./tools/testing/kunit/kunit.py run --timeout=30 --jobs=`nproc --all
|
||||
./tools/testing/kunit/kunit.py run --timeout=30 --jobs=`nproc --all`
|
||||
|
||||
- ``--timeout`` sets a maximum amount of time for tests to run.
|
||||
- ``--jobs`` sets the number of threads to build the kernel.
|
||||
@@ -58,8 +58,8 @@ To view kunit_tool flags (optional command-line arguments), run:
|
||||
|
||||
./tools/testing/kunit/kunit.py run --help
|
||||
|
||||
Create a ``.kunitconfig`` File
|
||||
===============================
|
||||
Creating a ``.kunitconfig`` file
|
||||
================================
|
||||
|
||||
If we want to run a specific set of tests (rather than those listed
|
||||
in the KUnit ``defconfig``), we can provide Kconfig options in the
|
||||
@@ -98,8 +98,8 @@ have not included the options dependencies.
|
||||
The build dir needs to be set for ``make menuconfig`` to
|
||||
work, therefore by default use ``make O=.kunit menuconfig``.
|
||||
|
||||
Configure, Build, and Run Tests
|
||||
===============================
|
||||
Configuring, building, and running tests
|
||||
========================================
|
||||
|
||||
If we want to make manual changes to the KUnit build process, we
|
||||
can run part of the KUnit build process independently.
|
||||
@@ -125,11 +125,11 @@ argument:
|
||||
|
||||
./tools/testing/kunit/kunit.py exec
|
||||
|
||||
The ``run`` command discussed in section: **Run Tests with kunit_tool**,
|
||||
The ``run`` command discussed in section: **Running tests with kunit_tool**,
|
||||
is equivalent to running the above three commands in sequence.
|
||||
|
||||
Parse Test Results
|
||||
==================
|
||||
Parsing test results
|
||||
====================
|
||||
|
||||
KUnit tests output displays results in TAP (Test Anything Protocol)
|
||||
format. When running tests, kunit_tool parses this output and prints
|
||||
@@ -152,8 +152,8 @@ standard input.
|
||||
# Reading from stdin
|
||||
dmesg | ./tools/testing/kunit/kunit.py parse
|
||||
|
||||
Run Selected Test Suites
|
||||
========================
|
||||
Filtering tests
|
||||
===============
|
||||
|
||||
By passing a bash style glob filter to the ``exec`` or ``run``
|
||||
commands, we can run a subset of the tests built into a kernel . For
|
||||
@@ -165,8 +165,10 @@ example: if we only want to run KUnit resource tests, use:
|
||||
|
||||
This uses the standard glob format with wildcard characters.
|
||||
|
||||
Run Tests on qemu
|
||||
=================
|
||||
.. _kunit-on-qemu:
|
||||
|
||||
Running tests on QEMU
|
||||
=====================
|
||||
|
||||
kunit_tool supports running tests on qemu as well as
|
||||
via UML. To run tests on qemu, by default it requires two flags:
|
||||
@@ -229,8 +231,8 @@ as
|
||||
--jobs=12 \
|
||||
--qemu_config=./tools/testing/kunit/qemu_configs/x86_64.py
|
||||
|
||||
Command-Line Arguments
|
||||
======================
|
||||
Running command-line arguments
|
||||
==============================
|
||||
|
||||
kunit_tool has a number of other command-line arguments which can
|
||||
be useful for our test environment. Below are the most commonly used
|
||||
@@ -249,14 +251,15 @@ command line arguments:
|
||||
compiling a kernel (using ``build`` or ``run`` commands). For example:
|
||||
to enable compiler warnings, we can pass ``--make_options W=1``.
|
||||
|
||||
- ``--alltests``: Builds a UML kernel with all config options enabled
|
||||
using ``make allyesconfig``. This allows us to run as many tests as
|
||||
possible.
|
||||
- ``--alltests``: Enable a predefined set of options in order to build
|
||||
as many tests as possible.
|
||||
|
||||
.. note:: It is slow and prone to breakage as new options are
|
||||
added or modified. Instead, enable all tests
|
||||
which have satisfied dependencies by adding
|
||||
``CONFIG_KUNIT_ALL_TESTS=y`` to your ``.kunitconfig``.
|
||||
.. note:: The list of enabled options can be found in
|
||||
``tools/testing/kunit/configs/all_tests.config``.
|
||||
|
||||
If you only want to enable all tests with otherwise satisfied
|
||||
dependencies, instead add ``CONFIG_KUNIT_ALL_TESTS=y`` to your
|
||||
``.kunitconfig``.
|
||||
|
||||
- ``--kunitconfig``: Specifies the path or the directory of the ``.kunitconfig``
|
||||
file. For example:
|
||||
|
||||
@@ -4,6 +4,10 @@
|
||||
Getting Started
|
||||
===============
|
||||
|
||||
This page contains an overview of the kunit_tool and KUnit framework,
|
||||
teaching how to run existing tests and then how to write a simple test case,
|
||||
and covers common problems users face when using KUnit for the first time.
|
||||
|
||||
Installing Dependencies
|
||||
=======================
|
||||
KUnit has the same dependencies as the Linux kernel. As long as you can
|
||||
@@ -19,30 +23,53 @@ can run kunit_tool:
|
||||
|
||||
./tools/testing/kunit/kunit.py run
|
||||
|
||||
For more information on this wrapper, see:
|
||||
.. note ::
|
||||
You may see the following error:
|
||||
"The source tree is not clean, please run 'make ARCH=um mrproper'"
|
||||
|
||||
This happens because internally kunit.py specifies ``.kunit``
|
||||
(default option) as the build directory in the command ``make O=output/dir``
|
||||
through the argument ``--build_dir``. Hence, before starting an
|
||||
out-of-tree build, the source tree must be clean.
|
||||
|
||||
There is also the same caveat mentioned in the "Build directory for
|
||||
the kernel" section of the :doc:`admin-guide </admin-guide/README>`,
|
||||
that is, its use, it must be used for all invocations of ``make``.
|
||||
The good news is that it can indeed be solved by running
|
||||
``make ARCH=um mrproper``, just be aware that this will delete the
|
||||
current configuration and all generated files.
|
||||
|
||||
If everything worked correctly, you should see the following:
|
||||
|
||||
.. code-block::
|
||||
|
||||
Configuring KUnit Kernel ...
|
||||
Building KUnit Kernel ...
|
||||
Starting KUnit Kernel ...
|
||||
|
||||
The tests will pass or fail.
|
||||
|
||||
.. note ::
|
||||
Because it is building a lot of sources for the first time,
|
||||
the ``Building KUnit Kernel`` step may take a while.
|
||||
|
||||
For detailed information on this wrapper, see:
|
||||
Documentation/dev-tools/kunit/run_wrapper.rst.
|
||||
|
||||
Creating a ``.kunitconfig``
|
||||
---------------------------
|
||||
Selecting which tests to run
|
||||
----------------------------
|
||||
|
||||
By default, kunit_tool runs a selection of tests. However, you can specify which
|
||||
unit tests to run by creating a ``.kunitconfig`` file with kernel config options
|
||||
that enable only a specific set of tests and their dependencies.
|
||||
The ``.kunitconfig`` file contains a list of kconfig options which are required
|
||||
to run the desired targets. The ``.kunitconfig`` also contains any other test
|
||||
specific config options, such as test dependencies. For example: the
|
||||
``FAT_FS`` tests - ``FAT_KUNIT_TEST``, depends on
|
||||
``FAT_FS``. ``FAT_FS`` can be enabled by selecting either ``MSDOS_FS``
|
||||
or ``VFAT_FS``. To run ``FAT_KUNIT_TEST``, the ``.kunitconfig`` has:
|
||||
By default, kunit_tool runs all tests reachable with minimal configuration,
|
||||
that is, using default values for most of the kconfig options. However,
|
||||
you can select which tests to run by:
|
||||
|
||||
.. code-block:: none
|
||||
- `Customizing Kconfig`_ used to compile the kernel, or
|
||||
- `Filtering tests by name`_ to select specifically which compiled tests to run.
|
||||
|
||||
CONFIG_KUNIT=y
|
||||
CONFIG_MSDOS_FS=y
|
||||
CONFIG_FAT_KUNIT_TEST=y
|
||||
|
||||
1. A good starting point for the ``.kunitconfig`` is the KUnit default config.
|
||||
You can generate it by running:
|
||||
Customizing Kconfig
|
||||
~~~~~~~~~~~~~~~~~~~
|
||||
A good starting point for the ``.kunitconfig`` is the KUnit default config.
|
||||
If you didn't run ``kunit.py run`` yet, you can generate it by running:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
@@ -54,48 +81,69 @@ or ``VFAT_FS``. To run ``FAT_KUNIT_TEST``, the ``.kunitconfig`` has:
|
||||
``.kunitconfig`` lives in the ``--build_dir`` used by kunit.py, which is
|
||||
``.kunit`` by default.
|
||||
|
||||
.. note ::
|
||||
You may want to remove CONFIG_KUNIT_ALL_TESTS from the ``.kunitconfig`` as
|
||||
it will enable a number of additional tests that you may not want.
|
||||
|
||||
2. You can then add any other Kconfig options, for example:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
CONFIG_LIST_KUNIT_TEST=y
|
||||
|
||||
Before running the tests, kunit_tool ensures that all config options
|
||||
set in ``.kunitconfig`` are set in the kernel ``.config``. It will warn
|
||||
you if you have not included dependencies for the options used.
|
||||
|
||||
.. note ::
|
||||
If you change the ``.kunitconfig``, kunit.py will trigger a rebuild of the
|
||||
There are many ways to customize the configurations:
|
||||
|
||||
a. Edit ``.kunit/.kunitconfig``. The file should contain the list of kconfig
|
||||
options required to run the desired tests, including their dependencies.
|
||||
You may want to remove CONFIG_KUNIT_ALL_TESTS from the ``.kunitconfig`` as
|
||||
it will enable a number of additional tests that you may not want.
|
||||
If you need to run on an architecture other than UML see :ref:`kunit-on-qemu`.
|
||||
|
||||
b. Enable additional kconfig options on top of ``.kunit/.kunitconfig``.
|
||||
For example, to include the kernel's linked-list test you can run::
|
||||
|
||||
./tools/testing/kunit/kunit.py run \
|
||||
--kconfig_add CONFIG_LIST_KUNIT_TEST=y
|
||||
|
||||
c. Provide the path of one or more .kunitconfig files from the tree.
|
||||
For example, to run only ``FAT_FS`` and ``EXT4`` tests you can run::
|
||||
|
||||
./tools/testing/kunit/kunit.py run \
|
||||
--kunitconfig ./fs/fat/.kunitconfig \
|
||||
--kunitconfig ./fs/ext4/.kunitconfig
|
||||
|
||||
d. If you change the ``.kunitconfig``, kunit.py will trigger a rebuild of the
|
||||
``.config`` file. But you can edit the ``.config`` file directly or with
|
||||
tools like ``make menuconfig O=.kunit``. As long as its a superset of
|
||||
``.kunitconfig``, kunit.py won't overwrite your changes.
|
||||
|
||||
Running Tests (KUnit Wrapper)
|
||||
-----------------------------
|
||||
1. To make sure that everything is set up correctly, invoke the Python
|
||||
wrapper from your kernel repository:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
./tools/testing/kunit/kunit.py run
|
||||
|
||||
If everything worked correctly, you should see the following:
|
||||
|
||||
.. code-block::
|
||||
|
||||
Generating .config ...
|
||||
Building KUnit Kernel ...
|
||||
Starting KUnit Kernel ...
|
||||
|
||||
The tests will pass or fail.
|
||||
|
||||
.. note ::
|
||||
Because it is building a lot of sources for the first time, the
|
||||
``Building KUnit kernel`` may take a while.
|
||||
|
||||
To save a .kunitconfig after finding a satisfactory configuration::
|
||||
|
||||
make savedefconfig O=.kunit
|
||||
cp .kunit/defconfig .kunit/.kunitconfig
|
||||
|
||||
Filtering tests by name
|
||||
~~~~~~~~~~~~~~~~~~~~~~~
|
||||
If you want to be more specific than Kconfig can provide, it is also possible
|
||||
to select which tests to execute at boot-time by passing a glob filter
|
||||
(read instructions regarding the pattern in the manpage :manpage:`glob(7)`).
|
||||
If there is a ``"."`` (period) in the filter, it will be interpreted as a
|
||||
separator between the name of the test suite and the test case,
|
||||
otherwise, it will be interpreted as the name of the test suite.
|
||||
For example, let's assume we are using the default config:
|
||||
|
||||
a. inform the name of a test suite, like ``"kunit_executor_test"``,
|
||||
to run every test case it contains::
|
||||
|
||||
./tools/testing/kunit/kunit.py run "kunit_executor_test"
|
||||
|
||||
b. inform the name of a test case prefixed by its test suite,
|
||||
like ``"example.example_simple_test"``, to run specifically that test case::
|
||||
|
||||
./tools/testing/kunit/kunit.py run "example.example_simple_test"
|
||||
|
||||
c. use wildcard characters (``*?[``) to run any test case that matches the pattern,
|
||||
like ``"*.*64*"`` to run test cases containing ``"64"`` in the name inside
|
||||
any test suite::
|
||||
|
||||
./tools/testing/kunit/kunit.py run "*.*64*"
|
||||
|
||||
Running Tests without the KUnit Wrapper
|
||||
=======================================
|
||||
@@ -217,7 +265,7 @@ Now we are ready to write the test cases.
|
||||
|
||||
obj-$(CONFIG_MISC_EXAMPLE_TEST) += example_test.o
|
||||
|
||||
4. Add the following lines to ``.kunitconfig``:
|
||||
4. Add the following lines to ``.kunit/.kunitconfig``:
|
||||
|
||||
.. code-block:: none
|
||||
|
||||
@@ -254,7 +302,5 @@ Next Steps
|
||||
examples.
|
||||
* Documentation/dev-tools/kunit/api/index.rst - KUnit APIs
|
||||
used for testing.
|
||||
* Documentation/dev-tools/kunit/kunit-tool.rst - kunit_tool helper
|
||||
script.
|
||||
* Documentation/dev-tools/kunit/faq.rst - KUnit common questions and
|
||||
answers.
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user