Merge tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
"Almost all of MM here. A few things are still getting finished off,
reviewed, etc.
- Yang Shi has improved the behaviour of khugepaged collapsing of
readonly file-backed transparent hugepages.
- Johannes Weiner has arranged for zswap memory use to be tracked and
managed on a per-cgroup basis.
- Muchun Song adds a /proc knob ("hugetlb_optimize_vmemmap") for
runtime enablement of the recent huge page vmemmap optimization
feature.
- Baolin Wang contributes a series to fix some issues around hugetlb
pagetable invalidation.
- Zhenwei Pi has fixed some interactions between hwpoisoned pages and
virtualization.
- Tong Tiangen has enabled the use of the presently x86-only
page_table_check debugging feature on arm64 and riscv.
- David Vernet has done some fixup work on the memcg selftests.
- Peter Xu has taught userfaultfd to handle write protection faults
against shmem- and hugetlbfs-backed files.
- More DAMON development from SeongJae Park - adding online tuning of
the feature and support for monitoring of fixed virtual address
ranges. Also easier discovery of which monitoring operations are
available.
- Nadav Amit has done some optimization of TLB flushing during
mprotect().
- Neil Brown continues to labor away at improving our swap-over-NFS
support.
- David Hildenbrand has some fixes to anon page COWing versus
get_user_pages().
- Peng Liu fixed some errors in the core hugetlb code.
- Joao Martins has reduced the amount of memory consumed by
device-dax's compound devmaps.
- Some cleanups of the arch-specific pagemap code from Anshuman
Khandual.
- Muchun Song has found and fixed some errors in the TLB flushing of
transparent hugepages.
- Roman Gushchin has done more work on the memcg selftests.
... and, of course, many smaller fixes and cleanups. Notably, the
customary million cleanup serieses from Miaohe Lin"
* tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (381 commits)
mm: kfence: use PAGE_ALIGNED helper
selftests: vm: add the "settings" file with timeout variable
selftests: vm: add "test_hmm.sh" to TEST_FILES
selftests: vm: check numa_available() before operating "merge_across_nodes" in ksm_tests
selftests: vm: add migration to the .gitignore
selftests/vm/pkeys: fix typo in comment
ksm: fix typo in comment
selftests: vm: add process_mrelease tests
Revert "mm/vmscan: never demote for memcg reclaim"
mm/kfence: print disabling or re-enabling message
include/trace/events/percpu.h: cleanup for "percpu: improve percpu_alloc_percpu event trace"
include/trace/events/mmflags.h: cleanup for "tracing: incorrect gfp_t conversion"
mm: fix a potential infinite loop in start_isolate_page_range()
MAINTAINERS: add Muchun as co-maintainer for HugeTLB
zram: fix Kconfig dependency warning
mm/shmem: fix shmem folio swapoff hang
cgroup: fix an error handling path in alloc_pagecache_max_30M()
mm: damon: use HPAGE_PMD_SIZE
tracing: incorrect isolate_mote_t cast in mm_vmscan_lru_isolate
nodemask.h: fix compilation error with GCC12
...
@@ -23,9 +23,10 @@ Date: Mar 2022
Contact: SeongJae Park <sj@kernel.org>
Description: Writing 'on' or 'off' to this file makes the kdamond starts or
stops, respectively. Reading the file returns the keywords
based on the current status. Writing 'update_schemes_stats' to
the file updates contents of schemes stats files of the
kdamond.
based on the current status. Writing 'commit' to this file
makes the kdamond reads the user inputs in the sysfs files
except 'state' again. Writing 'update_schemes_stats' to the
file updates contents of schemes stats files of the kdamond.

What: /sys/kernel/mm/damon/admin/kdamonds/<K>/pid
Date: Mar 2022
@@ -40,14 +41,24 @@ Description: Writing a number 'N' to this file creates the number of
directories for controlling each DAMON context named '0' to
'N-1' under the contexts/ directory.

What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/avail_operations
Date: Apr 2022
Contact: SeongJae Park <sj@kernel.org>
Description: Reading this file returns the available monitoring operations
sets on the currently running kernel.

What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/operations
Date: Mar 2022
Contact: SeongJae Park <sj@kernel.org>
Description: Writing a keyword for a monitoring operations set ('vaddr' for
virtual address spaces monitoring, and 'paddr' for the physical
address space monitoring) to this file makes the context to use
the operations set. Reading the file returns the keyword for
the operations set the context is set to use.
virtual address spaces monitoring, 'fvaddr' for fixed virtual
address ranges monitoring, and 'paddr' for the physical address
space monitoring) to this file makes the context to use the
operations set. Reading the file returns the keyword for the
operations set the context is set to use.

Note that only the operations sets that listed in
'avail_operations' file are valid inputs.

What: /sys/kernel/mm/damon/admin/kdamonds/<K>/contexts/<C>/monitoring_attrs/intervals/sample_us
Date: Mar 2022

@@ -343,6 +343,11 @@ Admin can request writeback of those idle pages at right timing via::

With the command, zram will writeback idle pages from memory to the storage.

Additionally, if a user choose to writeback only huge and idle pages
this can be accomplished with::

echo huge_idle > /sys/block/zramX/writeback

If an admin wants to write a specific page in zram device to the backing device,
they could write a page index into the interface.
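
A minimal illustrative sketch, assuming a ``zram0`` device that already has a
backing device configured (the page index below is arbitrary)::

   # mark resident pages idle, then write back only huge idle pages
   echo all > /sys/block/zram0/idle
   echo huge_idle > /sys/block/zram0/writeback
   # write back one specific page, selected by its index
   echo "page_index=26" > /sys/block/zram0/writeback
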
@@ -1208,6 +1208,34 @@ PAGE_SIZE multiple when read back.
high limit is used and monitored properly, this limit's
utility is limited to providing the final safety net.

memory.reclaim
A write-only nested-keyed file which exists for all cgroups.

This is a simple interface to trigger memory reclaim in the
target cgroup.

This file accepts a single key, the number of bytes to reclaim.
No nested keys are currently supported.

Example::

echo "1G" > memory.reclaim

The interface can be later extended with nested keys to
configure the reclaim behavior. For example, specify the
type of memory to reclaim from (anon, file, ..).

Please note that the kernel can over or under reclaim from
the target cgroup. If less bytes are reclaimed than the
specified amount, -EAGAIN is returned.

memory.peak
A read-only single value file which exists on non-root
cgroups.

The max memory usage recorded for the cgroup and its
descendants since the creation of the cgroup.

memory.oom.group
A read-write single value file which exists on non-root
cgroups. The default value is "0".
@@ -1326,6 +1354,12 @@ PAGE_SIZE multiple when read back.
Amount of cached filesystem data that is swap-backed,
such as tmpfs, shm segments, shared anonymous mmap()s

zswap
Amount of memory consumed by the zswap compression backend.

zswapped
Amount of application memory swapped out to zswap.

file_mapped
Amount of cached filesystem data mapped with mmap()

@@ -1516,6 +1550,21 @@ PAGE_SIZE multiple when read back.
higher than the limit for an extended period of time. This
reduces the impact on the workload and memory management.

memory.zswap.current
A read-only single value file which exists on non-root
cgroups.

The total amount of memory consumed by the zswap compression
backend.

memory.zswap.max
A read-write single value file which exists on non-root
cgroups. The default is "max".

Zswap usage hard limit. If a cgroup's zswap pool reaches this
limit, it will refuse to take any more stores before existing
entries fault back in or are written out to disk.
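
As an illustrative sketch (the cgroup path below is just an example), the
zswap knobs can be exercised with::

   cd /sys/fs/cgroup/example
   echo 64M > memory.zswap.max    # cap this cgroup's compressed pool
   cat memory.zswap.current       # bytes currently held by zswap
   grep ^zswap memory.stat        # zswap/zswapped entries described above
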
memory.pressure
A read-only nested-keyed file.

@@ -1705,16 +1705,16 @@
boot-time allocation of gigantic hugepages is skipped.

hugetlb_free_vmemmap=
[KNL] Requires CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
[KNL] Requires CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
enabled.
Allows heavy hugetlb users to free up some more
memory (7 * PAGE_SIZE for each 2MB hugetlb page).
Format: { on | off (default) }
Format: { [oO][Nn]/Y/y/1 | [oO][Ff]/N/n/0 (default) }

on: enable the feature
off: disable the feature
[oO][Nn]/Y/y/1: enable the feature
[oO][Ff]/N/n/0: disable the feature

Built with CONFIG_HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON=y,
Built with CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON=y,
the default is on.

This is not compatible with memory_hotplug.memmap_on_memory.
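
For illustration only (boot-loader configuration varies by distribution), the
parameter can be added to the kernel command line and checked after reboot::

   # e.g. append "hugetlb_free_vmemmap=on" to the boot loader's kernel command line
   grep -o 'hugetlb_free_vmemmap=[^ ]*' /proc/cmdline
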
@@ -66,6 +66,17 @@ Setting it as ``N`` disables DAMON_RECLAIM. Note that DAMON_RECLAIM could do
no real monitoring and reclamation due to the watermarks-based activation
condition. Refer to below descriptions for the watermarks parameter for this.

commit_inputs
-------------

Make DAMON_RECLAIM reads the input parameters again, except ``enabled``.

Input parameters that updated while DAMON_RECLAIM is running are not applied
by default. Once this parameter is set as ``Y``, DAMON_RECLAIM reads values
of parameters except ``enabled`` again. Once the re-reading is done, this
parameter is set as ``N``. If invalid parameters are found while the
re-reading, DAMON_RECLAIM will be disabled.
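
A brief sketch of the intended flow, using the module parameter files under
/sys/module/damon_reclaim/parameters/::

   cd /sys/module/damon_reclaim/parameters
   echo 30000000 > min_age   # change a parameter while DAMON_RECLAIM runs
   echo Y > commit_inputs    # ask the running kdamond to re-read the inputs
   cat commit_inputs         # reads back as N once the re-read has completed
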
min_age
-------

@@ -68,7 +68,7 @@ comma (","). ::
│ kdamonds/nr_kdamonds
│ │ 0/state,pid
│ │ │ contexts/nr_contexts
│ │ │ │ 0/operations
│ │ │ │ 0/avail_operations,operations
│ │ │ │ │ monitoring_attrs/
│ │ │ │ │ │ intervals/sample_us,aggr_us,update_us
│ │ │ │ │ │ nr_regions/min,max
@@ -121,10 +121,11 @@ In each kdamond directory, two files (``state`` and ``pid``) and one directory

Reading ``state`` returns ``on`` if the kdamond is currently running, or
``off`` if it is not running. Writing ``on`` or ``off`` makes the kdamond be
in the state. Writing ``update_schemes_stats`` to ``state`` file updates the
contents of stats files for each DAMON-based operation scheme of the kdamond.
For details of the stats, please refer to :ref:`stats section
<sysfs_schemes_stats>`.
in the state. Writing ``commit`` to the ``state`` file makes kdamond reads the
user inputs in the sysfs files except ``state`` file again. Writing
``update_schemes_stats`` to ``state`` file updates the contents of stats files
for each DAMON-based operation scheme of the kdamond. For details of the
stats, please refer to :ref:`stats section <sysfs_schemes_stats>`.

If the state is ``on``, reading ``pid`` shows the pid of the kdamond thread.
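
For example, a short sketch assuming a single kdamond directory ``0`` has been
created::

   cd /sys/kernel/mm/damon/admin/kdamonds/0
   echo on > state                     # start the kdamond
   cat pid                             # pid of the kdamond thread while it runs
   echo commit > state                 # re-read the sysfs inputs except 'state'
   echo update_schemes_stats > state   # refresh the schemes stats files
   echo off > state
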
@@ -143,17 +144,28 @@ be written to the file.
contexts/<N>/
-------------

In each context directory, one file (``operations``) and three directories
(``monitoring_attrs``, ``targets``, and ``schemes``) exist.
In each context directory, two files (``avail_operations`` and ``operations``)
and three directories (``monitoring_attrs``, ``targets``, and ``schemes``)
exist.

DAMON supports multiple types of monitoring operations, including those for
virtual address space and the physical address space. You can set and get what
type of monitoring operations DAMON will use for the context by writing one of
below keywords to, and reading from the file.
virtual address space and the physical address space. You can get the list of
available monitoring operations set on the currently running kernel by reading
``avail_operations`` file. Based on the kernel configuration, the file will
list some or all of below keywords.

- vaddr: Monitor virtual address spaces of specific processes
- fvaddr: Monitor fixed virtual address ranges
- paddr: Monitor the physical address space of the system

Please refer to :ref:`regions sysfs directory <sysfs_regions>` for detailed
differences between the operations sets in terms of the monitoring target
regions.

You can set and get what type of monitoring operations DAMON will use for the
context by writing one of the keywords listed in ``avail_operations`` file and
reading from the ``operations`` file.
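
For instance, assuming context directory ``0`` exists::

   cd /sys/kernel/mm/damon/admin/kdamonds/0/contexts/0
   cat avail_operations      # e.g. "vaddr fvaddr paddr" on this kernel
   echo paddr > operations   # pick one of the listed keywords
   cat operations
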
contexts/<N>/monitoring_attrs/
------------------------------

@@ -192,6 +204,8 @@ If you wrote ``vaddr`` to the ``contexts/<N>/operations``, each target should
be a process. You can specify the process to DAMON by writing the pid of the
process to the ``pid_target`` file.

.. _sysfs_regions:

targets/<N>/regions
-------------------

@@ -202,9 +216,10 @@ can be covered. However, users could want to set the initial monitoring region
to specific address ranges.

In contrast, DAMON do not automatically sets and updates the monitoring target
regions when ``paddr`` monitoring operations set is being used (``paddr`` is
written to the ``contexts/<N>/operations``). Therefore, users should set the
monitoring target regions by themselves in the case.
regions when ``fvaddr`` or ``paddr`` monitoring operations sets are being used
(``fvaddr`` or ``paddr`` have written to the ``contexts/<N>/operations``).
Therefore, users should set the monitoring target regions by themselves in the
cases.

For such cases, users can explicitly set the initial monitoring target regions
as they want, by writing proper values to the files under this directory.

@@ -164,7 +164,7 @@ default_hugepagesz
will all result in 256 2M huge pages being allocated. Valid default
huge page size is architecture dependent.
hugetlb_free_vmemmap
When CONFIG_HUGETLB_PAGE_FREE_VMEMMAP is set, this enables freeing
When CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is set, this enables optimizing
unused vmemmap pages associated with each HugeTLB page.

When multiple huge page sizes are supported, ``/proc/sys/vm/nr_hugepages``

@@ -184,6 +184,24 @@ The maximum possible ``pages_sharing/pages_shared`` ratio is limited by the
``max_page_sharing`` tunable. To increase the ratio ``max_page_sharing`` must
be increased accordingly.

Monitoring KSM events
=====================

There are some counters in /proc/vmstat that may be used to monitor KSM events.
KSM might help save memory, it's a tradeoff by may suffering delay on KSM COW
or on swapping in copy. Those events could help users evaluate whether or how
to use KSM. For example, if cow_ksm increases too fast, user may decrease the
range of madvise(, , MADV_MERGEABLE).

cow_ksm
is incremented every time a KSM page triggers copy on write (COW)
when users try to write to a KSM page, we have to make a copy.

ksm_swpin_copy
is incremented every time a KSM page is copied when swapping in
note that KSM page might be copied when swapping in because do_swap_page()
cannot do all the locking needed to reconstitute a cross-anon_vma KSM page.
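
For example, the counters can be watched with a simple::

   grep -E 'cow_ksm|ksm_swpin_copy' /proc/vmstat
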
--
Izik Eidus,
Hugh Dickins, 17 Nov 2009

@@ -62,6 +62,7 @@ Currently, these files are in /proc/sys/vm:
- overcommit_memory
- overcommit_ratio
- page-cluster
- page_lock_unfairness
- panic_on_oom
- percpu_pagelist_high_fraction
- stat_interval
@@ -561,6 +562,45 @@ Change the minimum size of the hugepage pool.
See Documentation/admin-guide/mm/hugetlbpage.rst


hugetlb_optimize_vmemmap
========================

This knob is not available when memory_hotplug.memmap_on_memory (kernel parameter)
is configured or the size of 'struct page' (a structure defined in
include/linux/mm_types.h) is not power of two (an unusual system config could
result in this).

Enable (set to 1) or disable (set to 0) the feature of optimizing vmemmap pages
associated with each HugeTLB page.

Once enabled, the vmemmap pages of subsequent allocation of HugeTLB pages from
buddy allocator will be optimized (7 pages per 2MB HugeTLB page and 4095 pages
per 1GB HugeTLB page), whereas already allocated HugeTLB pages will not be
optimized. When those optimized HugeTLB pages are freed from the HugeTLB pool
to the buddy allocator, the vmemmap pages representing that range needs to be
remapped again and the vmemmap pages discarded earlier need to be reallocated
again. If your use case is that HugeTLB pages are allocated 'on the fly' (e.g.
never explicitly allocating HugeTLB pages with 'nr_hugepages' but only set
'nr_overcommit_hugepages', those overcommitted HugeTLB pages are allocated 'on
the fly') instead of being pulled from the HugeTLB pool, you should weigh the
benefits of memory savings against the more overhead (~2x slower than before)
of allocation or freeing HugeTLB pages between the HugeTLB pool and the buddy
allocator. Another behavior to note is that if the system is under heavy memory
pressure, it could prevent the user from freeing HugeTLB pages from the HugeTLB
pool to the buddy allocator since the allocation of vmemmap pages could be
failed, you have to retry later if your system encounter this situation.

Once disabled, the vmemmap pages of subsequent allocation of HugeTLB pages from
buddy allocator will not be optimized meaning the extra overhead at allocation
time from buddy allocator disappears, whereas already optimized HugeTLB pages
will not be affected. If you want to make sure there are no optimized HugeTLB
pages, you can set "nr_hugepages" to 0 first and then disable this. Note that
writing 0 to nr_hugepages will make any "in use" HugeTLB pages become surplus
pages. So, those surplus pages are still optimized until they are no longer
in use. You would need to wait for those surplus pages to be released before
there are no optimized pages in the system.
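
An illustrative sketch of toggling the knob (the page counts are examples
only)::

   echo 1 > /proc/sys/vm/hugetlb_optimize_vmemmap   # subsequent hugetlb pages get optimized
   echo 16 > /proc/sys/vm/nr_hugepages              # allocate some 2MB huge pages
   echo 0 > /proc/sys/vm/nr_hugepages               # release them back to the buddy allocator
   echo 0 > /proc/sys/vm/hugetlb_optimize_vmemmap   # later allocations are left unoptimized
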
nr_hugepages_mempolicy
======================

@@ -754,6 +794,14 @@ extra faults and I/O delays for following faults if they would have been part of
that consecutive pages readahead would have brought in.


page_lock_unfairness
====================

This value determines the number of times that the page lock can be
stolen from under a waiter. After the lock is stolen the number of times
specified in this file (default is 5), the "fair lock handoff" semantics
will apply, and the waiter will only be awakened if the lock can be taken.
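
For example (values per the description above)::

   cat /proc/sys/vm/page_lock_unfairness        # default is 5
   echo 0 > /proc/sys/vm/page_lock_unfairness   # never allow the lock to be stolen
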
panic_on_oom
============


@@ -4,39 +4,76 @@ The Kernel Address Sanitizer (KASAN)
Overview
--------

KernelAddressSANitizer (KASAN) is a dynamic memory safety error detector
designed to find out-of-bound and use-after-free bugs. KASAN has three modes:
Kernel Address Sanitizer (KASAN) is a dynamic memory safety error detector
designed to find out-of-bounds and use-after-free bugs.

1. generic KASAN (similar to userspace ASan),
2. software tag-based KASAN (similar to userspace HWASan),
3. hardware tag-based KASAN (based on hardware memory tagging).
KASAN has three modes:

Generic KASAN is mainly used for debugging due to a large memory overhead.
Software tag-based KASAN can be used for dogfood testing as it has a lower
memory overhead that allows using it with real workloads. Hardware tag-based
KASAN comes with low memory and performance overheads and, therefore, can be
used in production. Either as an in-field memory bug detector or as a security
mitigation.
1. Generic KASAN
2. Software Tag-Based KASAN
3. Hardware Tag-Based KASAN

Software KASAN modes (#1 and #2) use compile-time instrumentation to insert
validity checks before every memory access and, therefore, require a compiler
version that supports that.
Generic KASAN, enabled with CONFIG_KASAN_GENERIC, is the mode intended for
debugging, similar to userspace ASan. This mode is supported on many CPU
architectures, but it has significant performance and memory overheads.

Generic KASAN is supported in GCC and Clang. With GCC, it requires version
8.3.0 or later. Any supported Clang version is compatible, but detection of
out-of-bounds accesses for global variables is only supported since Clang 11.
Software Tag-Based KASAN or SW_TAGS KASAN, enabled with CONFIG_KASAN_SW_TAGS,
can be used for both debugging and dogfood testing, similar to userspace HWASan.
This mode is only supported for arm64, but its moderate memory overhead allows
using it for testing on memory-restricted devices with real workloads.

Software tag-based KASAN mode is only supported in Clang.
Hardware Tag-Based KASAN or HW_TAGS KASAN, enabled with CONFIG_KASAN_HW_TAGS,
is the mode intended to be used as an in-field memory bug detector or as a
security mitigation. This mode only works on arm64 CPUs that support MTE
(Memory Tagging Extension), but it has low memory and performance overheads and
thus can be used in production.

The hardware KASAN mode (#3) relies on hardware to perform the checks but
still requires a compiler version that supports memory tagging instructions.
This mode is supported in GCC 10+ and Clang 12+.
For details about the memory and performance impact of each KASAN mode, see the
descriptions of the corresponding Kconfig options.

Both software KASAN modes work with SLUB and SLAB memory allocators,
while the hardware tag-based KASAN currently only supports SLUB.
The Generic and the Software Tag-Based modes are commonly referred to as the
software modes. The Software Tag-Based and the Hardware Tag-Based modes are
referred to as the tag-based modes.

Currently, generic KASAN is supported for the x86_64, arm, arm64, xtensa, s390,
and riscv architectures, and tag-based KASAN modes are supported only for arm64.
Support
-------

Architectures
~~~~~~~~~~~~~

Generic KASAN is supported on x86_64, arm, arm64, powerpc, riscv, s390, and
xtensa, and the tag-based KASAN modes are supported only on arm64.

Compilers
~~~~~~~~~

Software KASAN modes use compile-time instrumentation to insert validity checks
before every memory access and thus require a compiler version that provides
support for that. The Hardware Tag-Based mode relies on hardware to perform
these checks but still requires a compiler version that supports the memory
tagging instructions.

Generic KASAN requires GCC version 8.3.0 or later
or any Clang version supported by the kernel.

Software Tag-Based KASAN requires GCC 11+
or any Clang version supported by the kernel.

Hardware Tag-Based KASAN requires GCC 10+ or Clang 12+.

Memory types
~~~~~~~~~~~~

Generic KASAN supports finding bugs in all of slab, page_alloc, vmap, vmalloc,
stack, and global memory.

Software Tag-Based KASAN supports slab, page_alloc, vmalloc, and stack memory.

Hardware Tag-Based KASAN supports slab, page_alloc, and non-executable vmalloc
memory.

For slab, both software KASAN modes support SLUB and SLAB allocators, while
Hardware Tag-Based KASAN only supports SLUB.

Usage
-----
@@ -45,18 +82,59 @@ To enable KASAN, configure the kernel with::

CONFIG_KASAN=y

and choose between ``CONFIG_KASAN_GENERIC`` (to enable generic KASAN),
``CONFIG_KASAN_SW_TAGS`` (to enable software tag-based KASAN), and
``CONFIG_KASAN_HW_TAGS`` (to enable hardware tag-based KASAN).
and choose between ``CONFIG_KASAN_GENERIC`` (to enable Generic KASAN),
``CONFIG_KASAN_SW_TAGS`` (to enable Software Tag-Based KASAN), and
``CONFIG_KASAN_HW_TAGS`` (to enable Hardware Tag-Based KASAN).

For software modes, also choose between ``CONFIG_KASAN_OUTLINE`` and
For the software modes, also choose between ``CONFIG_KASAN_OUTLINE`` and
``CONFIG_KASAN_INLINE``. Outline and inline are compiler instrumentation types.
The former produces a smaller binary while the latter is 1.1-2 times faster.
The former produces a smaller binary while the latter is up to 2 times faster.

To include alloc and free stack traces of affected slab objects into reports,
enable ``CONFIG_STACKTRACE``. To include alloc and free stack traces of affected
physical pages, enable ``CONFIG_PAGE_OWNER`` and boot with ``page_owner=on``.

Boot parameters
~~~~~~~~~~~~~~~

KASAN is affected by the generic ``panic_on_warn`` command line parameter.
When it is enabled, KASAN panics the kernel after printing a bug report.

By default, KASAN prints a bug report only for the first invalid memory access.
With ``kasan_multi_shot``, KASAN prints a report on every invalid access. This
effectively disables ``panic_on_warn`` for KASAN reports.

Alternatively, independent of ``panic_on_warn``, the ``kasan.fault=`` boot
parameter can be used to control panic and reporting behaviour:

- ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN
report or also panic the kernel (default: ``report``). The panic happens even
if ``kasan_multi_shot`` is enabled.

Hardware Tag-Based KASAN mode (see the section about various modes below) is
intended for use in production as a security mitigation. Therefore, it supports
additional boot parameters that allow disabling KASAN or controlling features:

- ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``).

- ``kasan.mode=sync``, ``=async`` or ``=asymm`` controls whether KASAN
is configured in synchronous, asynchronous or asymmetric mode of
execution (default: ``sync``).
Synchronous mode: a bad access is detected immediately when a tag
check fault occurs.
Asynchronous mode: a bad access detection is delayed. When a tag check
fault occurs, the information is stored in hardware (in the TFSR_EL1
register for arm64). The kernel periodically checks the hardware and
only reports tag faults during these checks.
Asymmetric mode: a bad access is detected synchronously on reads and
asynchronously on writes.

- ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc
allocations (default: ``on``).

- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack
traces collection (default: ``on``).
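
As a purely illustrative example, a command line fragment for a Hardware
Tag-Based KASAN kernel combining the parameters above might look like the
comment below, and can be checked at runtime::

   # kasan=on kasan.mode=sync kasan.vmalloc=on kasan.stacktrace=off
   cat /proc/cmdline
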
Error reports
~~~~~~~~~~~~~

@@ -146,7 +224,7 @@ is either 8 or 16 aligned bytes depending on KASAN mode. Each number in the
memory state section of the report shows the state of one of the memory
granules that surround the accessed address.

For generic KASAN, the size of each memory granule is 8. The state of each
For Generic KASAN, the size of each memory granule is 8. The state of each
granule is encoded in one shadow byte. Those 8 bytes can be accessible,
partially accessible, freed, or be a part of a redzone. KASAN uses the following
encoding for each shadow byte: 00 means that all 8 bytes of the corresponding
@@ -171,47 +249,6 @@ traces point to places in code that interacted with the object but that are not
directly present in the bad access stack trace. Currently, this includes
call_rcu() and workqueue queuing.

Boot parameters
~~~~~~~~~~~~~~~

KASAN is affected by the generic ``panic_on_warn`` command line parameter.
When it is enabled, KASAN panics the kernel after printing a bug report.

By default, KASAN prints a bug report only for the first invalid memory access.
With ``kasan_multi_shot``, KASAN prints a report on every invalid access. This
effectively disables ``panic_on_warn`` for KASAN reports.

Alternatively, independent of ``panic_on_warn`` the ``kasan.fault=`` boot
parameter can be used to control panic and reporting behaviour:

- ``kasan.fault=report`` or ``=panic`` controls whether to only print a KASAN
report or also panic the kernel (default: ``report``). The panic happens even
if ``kasan_multi_shot`` is enabled.

Hardware tag-based KASAN mode (see the section about various modes below) is
intended for use in production as a security mitigation. Therefore, it supports
additional boot parameters that allow disabling KASAN or controlling features:

- ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``).

- ``kasan.mode=sync``, ``=async`` or ``=asymm`` controls whether KASAN
is configured in synchronous, asynchronous or asymmetric mode of
execution (default: ``sync``).
Synchronous mode: a bad access is detected immediately when a tag
check fault occurs.
Asynchronous mode: a bad access detection is delayed. When a tag check
fault occurs, the information is stored in hardware (in the TFSR_EL1
register for arm64). The kernel periodically checks the hardware and
only reports tag faults during these checks.
Asymmetric mode: a bad access is detected synchronously on reads and
asynchronously on writes.

- ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc
allocations (default: ``on``).

- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack
traces collection (default: ``on``).

Implementation details
----------------------

@@ -250,49 +287,46 @@ outline-instrumented kernel.
Generic KASAN is the only mode that delays the reuse of freed objects via
quarantine (see mm/kasan/quarantine.c for implementation).

Software tag-based KASAN
Software Tag-Based KASAN
~~~~~~~~~~~~~~~~~~~~~~~~

Software tag-based KASAN uses a software memory tagging approach to checking
Software Tag-Based KASAN uses a software memory tagging approach to checking
access validity. It is currently only implemented for the arm64 architecture.

Software tag-based KASAN uses the Top Byte Ignore (TBI) feature of arm64 CPUs
Software Tag-Based KASAN uses the Top Byte Ignore (TBI) feature of arm64 CPUs
to store a pointer tag in the top byte of kernel pointers. It uses shadow memory
to store memory tags associated with each 16-byte memory cell (therefore, it
dedicates 1/16th of the kernel memory for shadow memory).

On each memory allocation, software tag-based KASAN generates a random tag, tags
On each memory allocation, Software Tag-Based KASAN generates a random tag, tags
the allocated memory with this tag, and embeds the same tag into the returned
pointer.

Software tag-based KASAN uses compile-time instrumentation to insert checks
Software Tag-Based KASAN uses compile-time instrumentation to insert checks
before each memory access. These checks make sure that the tag of the memory
that is being accessed is equal to the tag of the pointer that is used to access
this memory. In case of a tag mismatch, software tag-based KASAN prints a bug
this memory. In case of a tag mismatch, Software Tag-Based KASAN prints a bug
report.

Software tag-based KASAN also has two instrumentation modes (outline, which
Software Tag-Based KASAN also has two instrumentation modes (outline, which
emits callbacks to check memory accesses; and inline, which performs the shadow
memory checks inline). With outline instrumentation mode, a bug report is
printed from the function that performs the access check. With inline
instrumentation, a ``brk`` instruction is emitted by the compiler, and a
dedicated ``brk`` handler is used to print bug reports.

Software tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through
Software Tag-Based KASAN uses 0xFF as a match-all pointer tag (accesses through
pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently
reserved to tag freed memory regions.

Software tag-based KASAN currently only supports tagging of slab, page_alloc,
and vmalloc memory.

Hardware tag-based KASAN
Hardware Tag-Based KASAN
~~~~~~~~~~~~~~~~~~~~~~~~

Hardware tag-based KASAN is similar to the software mode in concept but uses
Hardware Tag-Based KASAN is similar to the software mode in concept but uses
hardware memory tagging support instead of compiler instrumentation and
shadow memory.

Hardware tag-based KASAN is currently only implemented for arm64 architecture
Hardware Tag-Based KASAN is currently only implemented for arm64 architecture
and based on both arm64 Memory Tagging Extension (MTE) introduced in ARMv8.5
Instruction Set Architecture and Top Byte Ignore (TBI).

@@ -302,21 +336,18 @@ access, hardware makes sure that the tag of the memory that is being accessed is
equal to the tag of the pointer that is used to access this memory. In case of a
tag mismatch, a fault is generated, and a report is printed.

Hardware tag-based KASAN uses 0xFF as a match-all pointer tag (accesses through
Hardware Tag-Based KASAN uses 0xFF as a match-all pointer tag (accesses through
pointers with the 0xFF pointer tag are not checked). The value 0xFE is currently
reserved to tag freed memory regions.

Hardware tag-based KASAN currently only supports tagging of slab, page_alloc,
and VM_ALLOC-based vmalloc memory.

If the hardware does not support MTE (pre ARMv8.5), hardware tag-based KASAN
If the hardware does not support MTE (pre ARMv8.5), Hardware Tag-Based KASAN
will not be enabled. In this case, all KASAN boot parameters are ignored.

Note that enabling CONFIG_KASAN_HW_TAGS always results in in-kernel TBI being
enabled. Even when ``kasan.mode=off`` is provided or when the hardware does not
support MTE (but supports TBI).

Hardware tag-based KASAN only reports the first found bug. After that, MTE tag
Hardware Tag-Based KASAN only reports the first found bug. After that, MTE tag
checking gets disabled.

Shadow memory
@@ -414,19 +445,18 @@ generic ``noinstr`` one.
Note that disabling compiler instrumentation (either on a per-file or a
per-function basis) makes KASAN ignore the accesses that happen directly in
that code for software KASAN modes. It does not help when the accesses happen
indirectly (through calls to instrumented functions) or with the hardware
tag-based mode that does not use compiler instrumentation.
indirectly (through calls to instrumented functions) or with Hardware
Tag-Based KASAN, which does not use compiler instrumentation.

For software KASAN modes, to disable KASAN reports in a part of the kernel code
for the current task, annotate this part of the code with a
``kasan_disable_current()``/``kasan_enable_current()`` section. This also
disables the reports for indirect accesses that happen through function calls.

For tag-based KASAN modes (include the hardware one), to disable access
checking, use ``kasan_reset_tag()`` or ``page_kasan_tag_reset()``. Note that
temporarily disabling access checking via ``page_kasan_tag_reset()`` requires
saving and restoring the per-page KASAN tag via
``page_kasan_tag``/``page_kasan_tag_set``.
For tag-based KASAN modes, to disable access checking, use
``kasan_reset_tag()`` or ``page_kasan_tag_reset()``. Note that temporarily
disabling access checking via ``page_kasan_tag_reset()`` requires saving and
restoring the per-page KASAN tag via ``page_kasan_tag``/``page_kasan_tag_set``.

Tests
~~~~~

@@ -258,8 +258,9 @@ prototypes::
int (*launder_folio)(struct folio *);
bool (*is_partially_uptodate)(struct folio *, size_t from, size_t count);
int (*error_remove_page)(struct address_space *, struct page *);
int (*swap_activate)(struct file *);
int (*swap_activate)(struct swap_info_struct *sis, struct file *f, sector_t *span)
int (*swap_deactivate)(struct file *);
int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter);

locking rules:
All except dirty_folio and free_folio may block
@@ -287,6 +288,7 @@ is_partially_uptodate: yes
error_remove_page: yes
swap_activate: no
swap_deactivate: no
swap_rw: yes, unlocks
====================== ======================== ========= ===============

->write_begin(), ->write_end() and ->read_folio() may be called from
@@ -386,15 +388,19 @@ cleaned, or an error value if not. Note that in order to prevent the folio
getting mapped back in and redirtied, it needs to be kept locked
across the entire operation.

->swap_activate will be called with a non-zero argument on
files backing (non block device backed) swapfiles. A return value
of zero indicates success, in which case this file can be used for
backing swapspace. The swapspace operations will be proxied to the
address space operations.
->swap_activate() will be called to prepare the given file for swap. It
should perform any validation and preparation necessary to ensure that
writes can be performed with minimal memory allocation. It should call
add_swap_extent(), or the helper iomap_swapfile_activate(), and return
the number of extents added. If IO should be submitted through
->swap_rw(), it should set SWP_FS_OPS, otherwise IO will be submitted
directly to the block device ``sis->bdev``.

->swap_deactivate() will be called in the sys_swapoff()
path after ->swap_activate() returned success.

->swap_rw will be called for swap IO if SWP_FS_OPS was set by ->swap_activate().

file_lock_operations
====================

@@ -942,56 +942,73 @@ can be substantial. In many cases there are other means to find out
additional memory using subsystem specific interfaces, for instance
/proc/net/sockstat for TCP memory allocations.

The following is from a 16GB PIII, which has highmem enabled.
You may not have all of these fields.
Example output. You may not have all of these fields.

::

> cat /proc/meminfo

MemTotal: 16344972 kB
MemFree: 13634064 kB
MemAvailable: 14836172 kB
Buffers: 3656 kB
Cached: 1195708 kB
SwapCached: 0 kB
Active: 891636 kB
Inactive: 1077224 kB
HighTotal: 15597528 kB
HighFree: 13629632 kB
LowTotal: 747444 kB
LowFree: 4432 kB
SwapTotal: 0 kB
SwapFree: 0 kB
Dirty: 968 kB
Writeback: 0 kB
AnonPages: 861800 kB
Mapped: 280372 kB
Shmem: 644 kB
KReclaimable: 168048 kB
Slab: 284364 kB
SReclaimable: 159856 kB
SUnreclaim: 124508 kB
PageTables: 24448 kB
NFS_Unstable: 0 kB
Bounce: 0 kB
WritebackTmp: 0 kB
CommitLimit: 7669796 kB
Committed_AS: 100056 kB
VmallocTotal: 112216 kB
VmallocUsed: 428 kB
VmallocChunk: 111088 kB
Percpu: 62080 kB
HardwareCorrupted: 0 kB
AnonHugePages: 49152 kB
ShmemHugePages: 0 kB
ShmemPmdMapped: 0 kB
MemTotal: 32858820 kB
MemFree: 21001236 kB
MemAvailable: 27214312 kB
Buffers: 581092 kB
Cached: 5587612 kB
SwapCached: 0 kB
Active: 3237152 kB
Inactive: 7586256 kB
Active(anon): 94064 kB
Inactive(anon): 4570616 kB
Active(file): 3143088 kB
Inactive(file): 3015640 kB
Unevictable: 0 kB
Mlocked: 0 kB
SwapTotal: 0 kB
SwapFree: 0 kB
Zswap: 1904 kB
Zswapped: 7792 kB
Dirty: 12 kB
Writeback: 0 kB
AnonPages: 4654780 kB
Mapped: 266244 kB
Shmem: 9976 kB
KReclaimable: 517708 kB
Slab: 660044 kB
SReclaimable: 517708 kB
SUnreclaim: 142336 kB
KernelStack: 11168 kB
PageTables: 20540 kB
NFS_Unstable: 0 kB
Bounce: 0 kB
WritebackTmp: 0 kB
CommitLimit: 16429408 kB
Committed_AS: 7715148 kB
VmallocTotal: 34359738367 kB
VmallocUsed: 40444 kB
VmallocChunk: 0 kB
Percpu: 29312 kB
HardwareCorrupted: 0 kB
AnonHugePages: 4149248 kB
ShmemHugePages: 0 kB
ShmemPmdMapped: 0 kB
FileHugePages: 0 kB
FilePmdMapped: 0 kB
CmaTotal: 0 kB
CmaFree: 0 kB
HugePages_Total: 0
HugePages_Free: 0
HugePages_Rsvd: 0
HugePages_Surp: 0
Hugepagesize: 2048 kB
Hugetlb: 0 kB
DirectMap4k: 401152 kB
DirectMap2M: 10008576 kB
DirectMap1G: 24117248 kB

MemTotal
Total usable RAM (i.e. physical RAM minus a few reserved
bits and the kernel binary code)
MemFree
The sum of LowFree+HighFree
Total free RAM. On highmem systems, the sum of LowFree+HighFree
MemAvailable
An estimate of how much memory is available for starting new
applications, without swapping. Calculated from MemFree,
@@ -1005,8 +1022,9 @@ Buffers
Relatively temporary storage for raw disk blocks
shouldn't get tremendously large (20MB or so)
Cached
in-memory cache for files read from the disk (the
pagecache). Doesn't include SwapCached
In-memory cache for files read from the disk (the
pagecache) as well as tmpfs & shmem.
Doesn't include SwapCached.
SwapCached
Memory that once was swapped out, is swapped back in but
still also is in the swapfile (if memory is needed it
@@ -1018,6 +1036,11 @@ Active
Inactive
Memory which has been less recently used. It is more
eligible to be reclaimed for other purposes
Unevictable
Memory allocated for userspace which cannot be reclaimed, such
as mlocked pages, ramfs backing pages, secret memfd pages etc.
Mlocked
Memory locked with mlock().
HighTotal, HighFree
Highmem is all memory above ~860MB of physical memory.
Highmem areas are for use by userspace programs, or
@@ -1034,26 +1057,20 @@ SwapTotal
SwapFree
Memory which has been evicted from RAM, and is temporarily
on the disk
Zswap
Memory consumed by the zswap backend (compressed size)
Zswapped
Amount of anonymous memory stored in zswap (original size)
Dirty
Memory which is waiting to get written back to the disk
Writeback
Memory which is actively being written back to the disk
AnonPages
Non-file backed pages mapped into userspace page tables
HardwareCorrupted
The amount of RAM/memory in KB, the kernel identifies as
corrupted.
AnonHugePages
Non-file backed huge pages mapped into userspace page tables
Mapped
files which have been mmaped, such as libraries
Shmem
Total memory used by shared memory (shmem) and tmpfs
ShmemHugePages
Memory used by shared memory (shmem) and tmpfs allocated
with huge pages
ShmemPmdMapped
Shared memory mapped into userspace with huge pages
KReclaimable
Kernel allocations that the kernel will attempt to reclaim
under memory pressure. Includes SReclaimable (below), and other
@@ -1064,9 +1081,10 @@ SReclaimable
Part of Slab, that might be reclaimed, such as caches
SUnreclaim
Part of Slab, that cannot be reclaimed on memory pressure
KernelStack
Memory consumed by the kernel stacks of all tasks
PageTables
amount of memory dedicated to the lowest level of page
tables.
Memory consumed by userspace page tables
NFS_Unstable
Always zero. Previous counted pages which had been written to
the server, but has not been committed to stable storage.
@@ -1098,7 +1116,7 @@ Committed_AS
has been allocated by processes, even if it has not been
"used" by them as of yet. A process which malloc()'s 1G
of memory, but only touches 300M of it will show up as
using 1G. This 1G is memory which has been "committed" to
using 1G. This 1G is memory which has been "committed" to
by the VM and can be used at any time by the allocating
application. With strict overcommit enabled on the system
(mode 2 in 'vm.overcommit_memory'), allocations which would
@@ -1107,7 +1125,7 @@ Committed_AS
not fail due to lack of memory once that memory has been
successfully allocated.
VmallocTotal
total size of vmalloc memory area
total size of vmalloc virtual address space
VmallocUsed
amount of vmalloc area which is used
VmallocChunk
@@ -1115,6 +1133,30 @@ VmallocChunk
Percpu
Memory allocated to the percpu allocator used to back percpu
allocations. This stat excludes the cost of metadata.
HardwareCorrupted
The amount of RAM/memory in KB, the kernel identifies as
corrupted.
AnonHugePages
Non-file backed huge pages mapped into userspace page tables
ShmemHugePages
Memory used by shared memory (shmem) and tmpfs allocated
with huge pages
ShmemPmdMapped
Shared memory mapped into userspace with huge pages
FileHugePages
Memory used for filesystem data (page cache) allocated
with huge pages
FilePmdMapped
Page cache mapped into userspace with huge pages
CmaTotal
Memory reserved for the Contiguous Memory Allocator (CMA)
CmaFree
Free remaining memory in the CMA reserves
HugePages_Total, HugePages_Free, HugePages_Rsvd, HugePages_Surp, Hugepagesize, Hugetlb
See Documentation/admin-guide/mm/hugetlbpage.rst.
DirectMap4k, DirectMap2M, DirectMap1G
Breakdown of page table sizes used in the kernel's
identity mapping of RAM
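
For example, the new zswap counters can be read directly::

   grep -E '^(Zswap|Zswapped):' /proc/meminfo
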
vmallocinfo
~~~~~~~~~~~

@@ -749,8 +749,9 @@ cache in your filesystem. The following members are defined:
size_t count);
void (*is_dirty_writeback)(struct folio *, bool *, bool *);
int (*error_remove_page) (struct mapping *mapping, struct page *page);
int (*swap_activate)(struct file *);
int (*swap_activate)(struct swap_info_struct *sis, struct file *f, sector_t *span)
int (*swap_deactivate)(struct file *);
int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter);
};

``writepage``
@@ -948,15 +949,21 @@ cache in your filesystem. The following members are defined:
unless you have them locked or reference counts increased.

``swap_activate``
Called when swapon is used on a file to allocate space if
necessary and pin the block lookup information in memory. A
return value of zero indicates success, in which case this file
can be used to back swapspace.

Called to prepare the given file for swap. It should perform
any validation and preparation necessary to ensure that writes
can be performed with minimal memory allocation. It should call
add_swap_extent(), or the helper iomap_swapfile_activate(), and
return the number of extents added. If IO should be submitted
through ->swap_rw(), it should set SWP_FS_OPS, otherwise IO will
be submitted directly to the block device ``sis->bdev``.

``swap_deactivate``
Called during swapoff on files where swap_activate was
successful.

``swap_rw``
Called to read or write swap pages when SWP_FS_OPS is set.

The File Object
===============

@@ -50,61 +50,74 @@ space when they use mm context tags.
Temporary Virtual Mappings
==========================

The kernel contains several ways of creating temporary mappings:
The kernel contains several ways of creating temporary mappings. The following
list shows them in order of preference of use.

* vmap(). This can be used to make a long duration mapping of multiple
physical pages into a contiguous virtual space. It needs global
synchronization to unmap.
* kmap_local_page(). This function is used to require short term mappings.
It can be invoked from any context (including interrupts) but the mappings
can only be used in the context which acquired them.

* kmap(). This permits a short duration mapping of a single page. It needs
global synchronization, but is amortized somewhat. It is also prone to
deadlocks when using in a nested fashion, and so it is not recommended for
new code.
This function should be preferred, where feasible, over all the others.

These mappings are thread-local and CPU-local, meaning that the mapping
can only be accessed from within this thread and the thread is bound the
CPU while the mapping is active. Even if the thread is preempted (since
preemption is never disabled by the function) the CPU can not be
unplugged from the system via CPU-hotplug until the mapping is disposed.

It's valid to take pagefaults in a local kmap region, unless the context
in which the local mapping is acquired does not allow it for other reasons.

kmap_local_page() always returns a valid virtual address and it is assumed
that kunmap_local() will never fail.

Nesting kmap_local_page() and kmap_atomic() mappings is allowed to a certain
extent (up to KMAP_TYPE_NR) but their invocations have to be strictly ordered
because the map implementation is stack based. See kmap_local_page() kdocs
(included in the "Functions" section) for details on how to manage nested
mappings.

* kmap_atomic(). This permits a very short duration mapping of a single
page. Since the mapping is restricted to the CPU that issued it, it
performs well, but the issuing task is therefore required to stay on that
CPU until it has finished, lest some other task displace its mappings.

kmap_atomic() may also be used by interrupt contexts, since it is does not
sleep and the caller may not sleep until after kunmap_atomic() is called.
kmap_atomic() may also be used by interrupt contexts, since it does not
sleep and the callers too may not sleep until after kunmap_atomic() is
called.

It may be assumed that k[un]map_atomic() won't fail.
Each call of kmap_atomic() in the kernel creates a non-preemptible section
and disable pagefaults. This could be a source of unwanted latency. Therefore
users should prefer kmap_local_page() instead of kmap_atomic().

It is assumed that k[un]map_atomic() won't fail.

Using kmap_atomic
=================
* kmap(). This should be used to make short duration mapping of a single
page with no restrictions on preemption or migration. It comes with an
overhead as mapping space is restricted and protected by a global lock
for synchronization. When mapping is no longer needed, the address that
the page was mapped to must be released with kunmap().

When and where to use kmap_atomic() is straightforward. It is used when code
|
||||
wants to access the contents of a page that might be allocated from high memory
|
||||
(see __GFP_HIGHMEM), for example a page in the pagecache. The API has two
|
||||
functions, and they can be used in a manner similar to the following::
|
||||
Mapping changes must be propagated across all the CPUs. kmap() also
|
||||
requires global TLB invalidation when the kmap's pool wraps and it might
|
||||
block when the mapping space is fully utilized until a slot becomes
|
||||
available. Therefore, kmap() is only callable from preemptible context.
|
||||
|
||||
/* Find the page of interest. */
|
||||
struct page *page = find_get_page(mapping, offset);
|
||||
All the above work is necessary if a mapping must last for a relatively
|
||||
long time but the bulk of high-memory mappings in the kernel are
|
||||
short-lived and only used in one place. This means that the cost of
|
||||
kmap() is mostly wasted in such cases. kmap() was not intended for long
|
||||
term mappings but it has morphed in that direction and its use is
|
||||
strongly discouraged in newer code and the set of the preceding functions
|
||||
should be preferred.
|
||||
|
||||
/* Gain access to the contents of that page. */
|
||||
void *vaddr = kmap_atomic(page);
|
||||
On 64-bit systems, calls to kmap_local_page(), kmap_atomic() and kmap() have
|
||||
no real work to do because a 64-bit address space is more than sufficient to
|
||||
address all the physical memory whose pages are permanently mapped.
|
||||
|
||||
/* Do something to the contents of that page. */
|
||||
memset(vaddr, 0, PAGE_SIZE);
|
||||
|
||||
/* Unmap that page. */
|
||||
kunmap_atomic(vaddr);
|
||||
|
||||
Note that the kunmap_atomic() call takes the result of the kmap_atomic() call
|
||||
not the argument.
|
||||
|
||||
If you need to map two pages because you want to copy from one page to
|
||||
another you need to keep the kmap_atomic calls strictly nested, like::
|
||||
|
||||
vaddr1 = kmap_atomic(page1);
|
||||
vaddr2 = kmap_atomic(page2);
|
||||
|
||||
memcpy(vaddr1, vaddr2, PAGE_SIZE);
|
||||
|
||||
kunmap_atomic(vaddr2);
|
||||
kunmap_atomic(vaddr1);
|
||||
* vmap(). This can be used to make a long duration mapping of multiple
|
||||
physical pages into a contiguous virtual space. It needs global
|
||||
synchronization to unmap.
|
||||
|
||||
|
||||
Cost of Temporary Mappings
|
||||
@@ -145,3 +158,10 @@ The general recommendation is that you don't use more than 8GiB on a 32-bit
machine - although more might work for you and your workload, you're pretty
much on your own - don't expect kernel developers to really care much if things
come apart.


Functions
=========

.. kernel-doc:: include/linux/highmem.h
.. kernel-doc:: include/linux/highmem-internal.h
@@ -63,5 +63,6 @@ above structured documentation, or deleted if it has served its purpose.
   transhuge
   unevictable-lru
   vmalloced-kernel-stacks
   vmemmap_dedup
   z3fold
   zsmalloc
@@ -121,6 +121,14 @@ Usage
  -r                    Sort by memory release time.
  -s                    Sort by stack trace.
  -t                    Sort by times (default).
  --sort <order>        Specify sorting order.  Sorting syntax is [+|-]key[,[+|-]key[,...]].
                        Choose a key from the **STANDARD FORMAT SPECIFIERS** section. The "+" is
                        optional since default direction is increasing numerical or lexicographic
                        order. Mixed use of abbreviated and complete-form of keys is allowed.

  Examples:
                        ./page_owner_sort <input> <output> --sort=n,+pid,-tgid
                        ./page_owner_sort <input> <output> --sort=at
additional function::

@@ -129,7 +137,6 @@ Usage
                        Specify culling rules. Culling syntax is key[,key[,...]]. Choose a
                        multi-letter key from the **STANDARD FORMAT SPECIFIERS** section.

        <rules> is a single argument in the form of a comma-separated list,
        which offers a way to specify individual culling rules. The recognized
        keywords are described in the **STANDARD FORMAT SPECIFIERS** section below.
@@ -137,7 +144,6 @@ Usage
        the STANDARD SORT KEYS section below. Mixed use of abbreviated and
        complete-form of keys is allowed.

  Examples:
                        ./page_owner_sort <input> <output> --cull=stacktrace
                        ./page_owner_sort <input> <output> --cull=st,pid,name
@@ -147,17 +153,44 @@ Usage
  -f                    Filter out the information of blocks whose memory has been released.

  Select:
  --pid <PID>           Select by pid.
  --tgid <TGID>         Select by tgid.
  --name <command>      Select by task command name.
  --pid <pidlist>       Select by pid. This selects the blocks whose process ID
                        numbers appear in <pidlist>.
  --tgid <tgidlist>     Select by tgid. This selects the blocks whose thread
                        group ID numbers appear in <tgidlist>.
  --name <cmdlist>      Select by task command name. This selects the blocks whose
                        task command names appear in <cmdlist>.

  <pidlist>, <tgidlist>, <cmdlist> are single arguments in the form of a comma-separated list,
  which offers a way to specify individual selecting rules.

  Examples:
                        ./page_owner_sort <input> <output> --pid=1
                        ./page_owner_sort <input> <output> --tgid=1,2,3
                        ./page_owner_sort <input> <output> --name name1,name2
STANDARD FORMAT SPECIFIERS
==========================
::

  For --sort option:

        KEY             LONG            DESCRIPTION
        p               pid             process ID
        tg              tgid            thread group ID
        n               name            task command name
        st              stacktrace      stack trace of the page allocation
        T               txt             full text of block
        ft              free_ts         timestamp of the page when it was released
        at              alloc_ts        timestamp of the page when it was allocated
        ator            allocator       memory allocator for pages

  For --cull option:

        KEY             LONG            DESCRIPTION
        p               pid             process ID
        tg              tgid            thread group ID
        n               name            task command name
        f               free            whether the page has been released or not
        st              stacktrace      stack trace of the page allocation
        ator            allocator       memory allocator for pages
Documentation/vm/vmemmap_dedup.rst (new file, 223 lines)
@@ -0,0 +1,223 @@
.. SPDX-License-Identifier: GPL-2.0

=========================================
A vmemmap diet for HugeTLB and Device DAX
=========================================

HugeTLB
=======

The struct page structures (page structs) are used to describe a physical
page frame. By default, there is a one-to-one mapping from a page frame to
its corresponding page struct.
HugeTLB pages consist of multiple base page size pages and are supported by many
architectures. See Documentation/admin-guide/mm/hugetlbpage.rst for more
details. On the x86-64 architecture, HugeTLB pages of size 2MB and 1GB are
currently supported. Since the base page size on x86 is 4KB, a 2MB HugeTLB page
consists of 512 base pages and a 1GB HugeTLB page consists of 262144 base pages.
For each base page, there is a corresponding page struct.

Within the HugeTLB subsystem, only the first 4 page structs are used to
contain unique information about a HugeTLB page. __NR_USED_SUBPAGE provides
this upper limit. The only 'useful' information in the remaining page structs
is the compound_head field, and this field is the same for all tail pages.

By removing redundant page structs for HugeTLB pages, memory can be returned
to the buddy allocator for other uses.
Different architectures support different HugeTLB pages. For example, the
following table shows the HugeTLB page sizes supported by the x86 and arm64
architectures. Because arm64 supports 4k, 16k, and 64k base pages and
supports contiguous entries, it supports many kinds of HugeTLB page sizes.

+--------------+-----------+-----------------------------------------------+
| Architecture | Page Size |               HugeTLB Page Size               |
+--------------+-----------+-----------+-----------+-----------+-----------+
|    x86-64    |    4KB    |    2MB    |    1GB    |           |           |
+--------------+-----------+-----------+-----------+-----------+-----------+
|              |    4KB    |   64KB    |    2MB    |   32MB    |    1GB    |
|              +-----------+-----------+-----------+-----------+-----------+
|    arm64     |   16KB    |    2MB    |   32MB    |    1GB    |           |
|              +-----------+-----------+-----------+-----------+-----------+
|              |   64KB    |    2MB    |   512MB   |   16GB    |           |
+--------------+-----------+-----------+-----------+-----------+-----------+
When the system boots up, every HugeTLB page has more than one struct page
struct, whose total size is (unit: pages)::

   struct_size = HugeTLB_Size / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE

Where HugeTLB_Size is the size of the HugeTLB page. We know that the size
of the HugeTLB page is always n times PAGE_SIZE. So we can get the following
relationship::

   HugeTLB_Size = n * PAGE_SIZE

Then::

   struct_size = n * PAGE_SIZE / PAGE_SIZE * sizeof(struct page) / PAGE_SIZE
               = n * sizeof(struct page) / PAGE_SIZE
We can use huge mapping at the pud/pmd level for the HugeTLB page.

For the HugeTLB page of the pmd level mapping, then::

   struct_size = n * sizeof(struct page) / PAGE_SIZE
               = PAGE_SIZE / sizeof(pte_t) * sizeof(struct page) / PAGE_SIZE
               = sizeof(struct page) / sizeof(pte_t)
               = 64 / 8
               = 8 (pages)

Where n is how many pte entries one page can contain. So the value of
n is (PAGE_SIZE / sizeof(pte_t)).

This optimization only supports 64-bit systems, so the value of sizeof(pte_t)
is 8. And this optimization is also applicable only when the size of struct page
is a power of two. In most cases, the size of struct page is 64 bytes (e.g.
x86-64 and arm64). So if we use pmd level mapping for a HugeTLB page, the
size of its struct page structs is 8 page frames, whose size depends on the
size of the base page.
For the HugeTLB page of the pud level mapping, then::

   struct_size = PAGE_SIZE / sizeof(pmd_t) * struct_size(pmd)
               = PAGE_SIZE / 8 * 8 (pages)
               = PAGE_SIZE (pages)

Where struct_size(pmd) is the size of the struct page structs of a
HugeTLB page of the pmd level mapping.

E.g.: a 2MB HugeTLB page on x86_64 consists of 8 page frames of struct page
structs, while a 1GB HugeTLB page consists of 4096.

Next, we take the pmd level mapping of the HugeTLB page as an example to
show the internal implementation of this optimization. There are 8 pages of
struct page structs associated with a HugeTLB page which is pmd mapped.
Here is how things look before optimization::

 HugeTLB                  struct pages(8 pages)         page frame(8 pages)
 +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
 |           |                     |     0     | -------------> |     0     |
 |           |                     +-----------+                +-----------+
 |           |                     |     1     | -------------> |     1     |
 |           |                     +-----------+                +-----------+
 |           |                     |     2     | -------------> |     2     |
 |           |                     +-----------+                +-----------+
 |           |                     |     3     | -------------> |     3     |
 |           |                     +-----------+                +-----------+
 |           |                     |     4     | -------------> |     4     |
 |    PMD    |                     +-----------+                +-----------+
 |   level   |                     |     5     | -------------> |     5     |
 |  mapping  |                     +-----------+                +-----------+
 |           |                     |     6     | -------------> |     6     |
 |           |                     +-----------+                +-----------+
 |           |                     |     7     | -------------> |     7     |
 |           |                     +-----------+                +-----------+
 |           |
 |           |
 |           |
 +-----------+
The value of page->compound_head is the same for all tail pages. The first
page of page structs (page 0) associated with the HugeTLB page contains the 4
page structs necessary to describe the HugeTLB. The only use of the remaining
pages of page structs (page 1 to page 7) is to point to page->compound_head.
Therefore, we can remap pages 1 to 7 to page 0. Only 1 page of page structs
will be used for each HugeTLB page. This will allow us to free the remaining
7 pages to the buddy allocator.
Here is how things look after remapping::

 HugeTLB                  struct pages(8 pages)         page frame(8 pages)
 +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
 |           |                     |     0     | -------------> |     0     |
 |           |                     +-----------+                +-----------+
 |           |                     |     1     | ---------------^ ^ ^ ^ ^ ^ ^
 |           |                     +-----------+                  | | | | | |
 |           |                     |     2     | -----------------+ | | | | |
 |           |                     +-----------+                    | | | | |
 |           |                     |     3     | -------------------+ | | | |
 |           |                     +-----------+                      | | | |
 |           |                     |     4     | ---------------------+ | | |
 |    PMD    |                     +-----------+                        | | |
 |   level   |                     |     5     | -----------------------+ | |
 |  mapping  |                     +-----------+                          | |
 |           |                     |     6     | -------------------------+ |
 |           |                     +-----------+                            |
 |           |                     |     7     | ---------------------------+
 |           |                     +-----------+
 |           |
 |           |
 |           |
 +-----------+
When a HugeTLB is freed to the buddy system, we should allocate 7 pages for
vmemmap pages and restore the previous mapping relationship.
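To make the arithmetic above concrete, here is a small illustrative sketch
(not part of the patch; it assumes 4 KiB base pages and a 64-byte struct
page, as on x86-64) of how many vmemmap pages the optimization hands back to
the buddy allocator per HugeTLB page::

        /* Illustrative only: vmemmap pages freed per HugeTLB page. */
        static unsigned long vmemmap_pages_freed(unsigned long hugepage_size)
        {
                unsigned long base_pages = hugepage_size / 4096;
                unsigned long struct_pages = base_pages * 64 / 4096;

                /* All tail vmemmap pages are freed; the head page is kept. */
                return struct_pages - 1;        /* 7 for 2 MB, 4095 for 1 GB */
        }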
For the HugeTLB page of the pud level mapping, it is similar to the former.
We can also use this approach to free (PAGE_SIZE - 1) vmemmap pages.

Apart from the HugeTLB page of the pmd/pud level mapping, some architectures
(e.g. aarch64) provide a contiguous bit in the translation table entries
that hints to the MMU to indicate that it is one of a contiguous set of
entries that can be cached in a single TLB entry.

The contiguous bit is used to increase the mapping size at the pmd and pte
(last) level. So this type of HugeTLB page can be optimized only when the
size of its struct page structs is greater than 1 page.
Notice: The head vmemmap page is not freed to the buddy allocator and all
tail vmemmap pages are mapped to the head vmemmap page frame. So we can see
more than one struct page struct with PG_head (e.g. 8 per 2 MB HugeTLB page)
associated with each HugeTLB page. compound_head() can handle this
correctly (for more details, refer to the comment above compound_head()).
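As an illustrative aside (not part of this patch, and the helper name is
hypothetical), callers that need the real head page should therefore rely on
compound_head() rather than testing PG_head on an arbitrary struct page::

        /* Illustrative only: resolve the genuine head page of a HugeTLB page. */
        static struct page *hugetlb_real_head(struct page *page)
        {
                /*
                 * With the vmemmap optimization, several struct pages can
                 * appear to carry PG_head because their vmemmap frames alias
                 * the head frame; compound_head() still returns the true head.
                 */
                return compound_head(page);
        }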
Device DAX
==========

The device-dax interface uses the same tail deduplication technique explained
in the previous chapter, except when used with the vmemmap in
the device (altmap).

The following page sizes are supported in DAX: PAGE_SIZE (4K on x86_64),
PMD_SIZE (2M on x86_64) and PUD_SIZE (1G on x86_64).

The differences with HugeTLB are relatively minor.

It only uses 3 page structs for storing all information as opposed
to 4 on HugeTLB pages.

There's no remapping of vmemmap given that device-dax memory is not part of
System RAM ranges initialized at boot. Thus the tail page deduplication
happens at a later stage when we populate the sections. HugeTLB reuses the
head vmemmap page, whereas device-dax reuses the tail vmemmap page. This
results in only half of the savings compared to HugeTLB.

Deduplicated tail pages are not mapped read-only.

Here's how things look on device-dax after the sections are populated::
 +-----------+ ---virt_to_page---> +-----------+   mapping to   +-----------+
 |           |                     |     0     | -------------> |     0     |
 |           |                     +-----------+                +-----------+
 |           |                     |     1     | -------------> |     1     |
 |           |                     +-----------+                +-----------+
 |           |                     |     2     | ----------------^ ^ ^ ^ ^ ^
 |           |                     +-----------+                   | | | | |
 |           |                     |     3     | ------------------+ | | | |
 |           |                     +-----------+                     | | | |
 |           |                     |     4     | --------------------+ | | |
 |    PMD    |                     +-----------+                       | | |
 |   level   |                     |     5     | ----------------------+ | |
 |  mapping  |                     +-----------+                         | |
 |           |                     |     6     | ------------------------+ |
 |           |                     +-----------+                           |
 |           |                     |     7     | --------------------------+
 |           |                     +-----------+
 |           |
 |           |
 |           |
 +-----------+
@@ -5027,6 +5027,7 @@ F: Documentation/admin-guide/cgroup-v1/
F: Documentation/admin-guide/cgroup-v2.rst
F: include/linux/cgroup*
F: kernel/cgroup/
F: tools/testing/selftests/cgroup/

CONTROL GROUP - BLOCK IO CONTROLLER (BLKIO)
M: Tejun Heo <tj@kernel.org>
@@ -5060,6 +5061,8 @@ L: linux-mm@kvack.org
S: Maintained
F: mm/memcontrol.c
F: mm/swap_cgroup.c
F: tools/testing/selftests/cgroup/test_kmem.c
F: tools/testing/selftests/cgroup/test_memcontrol.c

CORETEMP HARDWARE MONITORING DRIVER
M: Fenghua Yu <fenghua.yu@intel.com>
@@ -9064,16 +9067,20 @@ S: Orphan
F: Documentation/networking/device_drivers/ethernet/huawei/hinic.rst
F: drivers/net/ethernet/huawei/hinic/

HUGETLB FILESYSTEM
HUGETLB SUBSYSTEM
M: Mike Kravetz <mike.kravetz@oracle.com>
M: Muchun Song <songmuchun@bytedance.com>
L: linux-mm@kvack.org
S: Maintained
F: Documentation/ABI/testing/sysfs-kernel-mm-hugepages
F: Documentation/admin-guide/mm/hugetlbpage.rst
F: Documentation/vm/hugetlbfs_reserv.rst
F: Documentation/vm/vmemmap_dedup.rst
F: fs/hugetlbfs/
F: include/linux/hugetlb.h
F: mm/hugetlb.c
F: mm/hugetlb_vmemmap.c
F: mm/hugetlb_vmemmap.h

HVA ST MEDIA DRIVER
M: Jean-Christophe Trotin <jean-christophe.trotin@foss.st.com>
@@ -18,7 +18,7 @@ extern void clear_page(void *page);
|
||||
#define clear_user_page(page, vaddr, pg) clear_page(page)
|
||||
|
||||
#define alloc_zeroed_user_highpage_movable(vma, vaddr) \
|
||||
alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vmaddr)
|
||||
alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, vma, vaddr)
|
||||
#define __HAVE_ARCH_ALLOC_ZEROED_USER_HIGHPAGE_MOVABLE
|
||||
|
||||
extern void copy_page(void * _to, void * _from);
|
||||
|
||||
@@ -45,6 +45,7 @@ config ARM64
|
||||
select ARCH_HAS_SYSCALL_WRAPPER
|
||||
select ARCH_HAS_TEARDOWN_DMA_OPS if IOMMU_SUPPORT
|
||||
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
|
||||
select ARCH_HAS_VM_GET_PAGE_PROT
|
||||
select ARCH_HAS_ZONE_DMA_SET if EXPERT
|
||||
select ARCH_HAVE_ELF_PROT
|
||||
select ARCH_HAVE_NMI_SAFE_CMPXCHG
|
||||
@@ -91,11 +92,13 @@ config ARM64
|
||||
select ARCH_SUPPORTS_ATOMIC_RMW
|
||||
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
|
||||
select ARCH_SUPPORTS_NUMA_BALANCING
|
||||
select ARCH_SUPPORTS_PAGE_TABLE_CHECK
|
||||
select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT
|
||||
select ARCH_WANT_DEFAULT_BPF_JIT
|
||||
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
|
||||
select ARCH_WANT_FRAME_POINTERS
|
||||
select ARCH_WANT_HUGE_PMD_SHARE if ARM64_4K_PAGES || (ARM64_16K_PAGES && !ARM64_VA_BITS_36)
|
||||
select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
|
||||
select ARCH_WANT_LD_ORPHAN_WARN
|
||||
select ARCH_WANTS_NO_INSTR
|
||||
select ARCH_HAS_UBSAN_SANITIZE_ALL
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
#define __ASM_CACHE_H
|
||||
|
||||
#include <asm/cputype.h>
|
||||
#include <asm/mte-def.h>
|
||||
|
||||
#define CTR_L1IP_SHIFT 14
|
||||
#define CTR_L1IP_MASK 3
|
||||
@@ -49,15 +50,21 @@
|
||||
*/
|
||||
#define ARCH_DMA_MINALIGN (128)
|
||||
|
||||
#ifdef CONFIG_KASAN_SW_TAGS
|
||||
#define ARCH_SLAB_MINALIGN (1ULL << KASAN_SHADOW_SCALE_SHIFT)
|
||||
#elif defined(CONFIG_KASAN_HW_TAGS)
|
||||
#define ARCH_SLAB_MINALIGN MTE_GRANULE_SIZE
|
||||
#endif
|
||||
|
||||
#ifndef __ASSEMBLY__
|
||||
|
||||
#include <linux/bitops.h>
|
||||
#include <linux/kasan-enabled.h>
|
||||
|
||||
#ifdef CONFIG_KASAN_SW_TAGS
|
||||
#define ARCH_SLAB_MINALIGN (1ULL << KASAN_SHADOW_SCALE_SHIFT)
|
||||
#elif defined(CONFIG_KASAN_HW_TAGS)
|
||||
static inline unsigned int arch_slab_minalign(void)
|
||||
{
|
||||
return kasan_hw_tags_enabled() ? MTE_GRANULE_SIZE :
|
||||
__alignof__(unsigned long long);
|
||||
}
|
||||
#define arch_slab_minalign() arch_slab_minalign()
|
||||
#endif
|
||||
|
||||
#define ICACHEF_ALIASING 0
|
||||
#define ICACHEF_VPIPT 1
|
||||
|
||||
@@ -39,8 +39,8 @@ extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
|
||||
extern void huge_ptep_set_wrprotect(struct mm_struct *mm,
|
||||
unsigned long addr, pte_t *ptep);
|
||||
#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
|
||||
extern void huge_ptep_clear_flush(struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t *ptep);
|
||||
extern pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t *ptep);
|
||||
#define __HAVE_ARCH_HUGE_PTE_CLEAR
|
||||
extern void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
|
||||
pte_t *ptep, unsigned long sz);
|
||||
|
||||
@@ -35,30 +35,6 @@ static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags)
|
||||
}
|
||||
#define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags)
|
||||
|
||||
static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags)
|
||||
{
|
||||
pteval_t prot = 0;
|
||||
|
||||
if (vm_flags & VM_ARM64_BTI)
|
||||
prot |= PTE_GP;
|
||||
|
||||
/*
|
||||
* There are two conditions required for returning a Normal Tagged
|
||||
* memory type: (1) the user requested it via PROT_MTE passed to
|
||||
* mmap() or mprotect() and (2) the corresponding vma supports MTE. We
|
||||
* register (1) as VM_MTE in the vma->vm_flags and (2) as
|
||||
* VM_MTE_ALLOWED. Note that the latter can only be set during the
|
||||
* mmap() call since mprotect() does not accept MAP_* flags.
|
||||
* Checking for VM_MTE only is sufficient since arch_validate_flags()
|
||||
* does not permit (VM_MTE & !VM_MTE_ALLOWED).
|
||||
*/
|
||||
if (vm_flags & VM_MTE)
|
||||
prot |= PTE_ATTRINDX(MT_NORMAL_TAGGED);
|
||||
|
||||
return __pgprot(prot);
|
||||
}
|
||||
#define arch_vm_get_page_prot(vm_flags) arch_vm_get_page_prot(vm_flags)
|
||||
|
||||
static inline bool arch_validate_prot(unsigned long prot,
|
||||
unsigned long addr __always_unused)
|
||||
{
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
#define __ASM_MTE_KASAN_H
|
||||
|
||||
#include <asm/compiler.h>
|
||||
#include <asm/cputype.h>
|
||||
#include <asm/mte-def.h>
|
||||
|
||||
#ifndef __ASSEMBLY__
|
||||
|
||||
@@ -10,6 +10,7 @@
|
||||
#include <asm/alternative.h>
|
||||
#include <asm/cmpxchg.h>
|
||||
#include <asm/stack_pointer.h>
|
||||
#include <asm/sysreg.h>
|
||||
|
||||
static inline void set_my_cpu_offset(unsigned long off)
|
||||
{
|
||||
|
||||
@@ -14,6 +14,7 @@
|
||||
* Software defined PTE bits definition.
|
||||
*/
|
||||
#define PTE_WRITE (PTE_DBM) /* same as DBM (51) */
|
||||
#define PTE_SWP_EXCLUSIVE (_AT(pteval_t, 1) << 2) /* only for swp ptes */
|
||||
#define PTE_DIRTY (_AT(pteval_t, 1) << 55)
|
||||
#define PTE_SPECIAL (_AT(pteval_t, 1) << 56)
|
||||
#define PTE_DEVMAP (_AT(pteval_t, 1) << 57)
|
||||
|
||||
@@ -33,6 +33,7 @@
|
||||
#include <linux/mmdebug.h>
|
||||
#include <linux/mm_types.h>
|
||||
#include <linux/sched.h>
|
||||
#include <linux/page_table_check.h>
|
||||
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
#define __HAVE_ARCH_FLUSH_PMD_TLB_RANGE
|
||||
@@ -96,6 +97,7 @@ static inline pteval_t __phys_to_pte_val(phys_addr_t phys)
|
||||
#define pte_young(pte) (!!(pte_val(pte) & PTE_AF))
|
||||
#define pte_special(pte) (!!(pte_val(pte) & PTE_SPECIAL))
|
||||
#define pte_write(pte) (!!(pte_val(pte) & PTE_WRITE))
|
||||
#define pte_user(pte) (!!(pte_val(pte) & PTE_USER))
|
||||
#define pte_user_exec(pte) (!(pte_val(pte) & PTE_UXN))
|
||||
#define pte_cont(pte) (!!(pte_val(pte) & PTE_CONT))
|
||||
#define pte_devmap(pte) (!!(pte_val(pte) & PTE_DEVMAP))
|
||||
@@ -312,8 +314,8 @@ static inline void __check_racy_pte_update(struct mm_struct *mm, pte_t *ptep,
|
||||
__func__, pte_val(old_pte), pte_val(pte));
|
||||
}
|
||||
|
||||
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
|
||||
pte_t *ptep, pte_t pte)
|
||||
static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr,
|
||||
pte_t *ptep, pte_t pte)
|
||||
{
|
||||
if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte))
|
||||
__sync_icache_dcache(pte);
|
||||
@@ -343,6 +345,13 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
|
||||
set_pte(ptep, pte);
|
||||
}
|
||||
|
||||
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
|
||||
pte_t *ptep, pte_t pte)
|
||||
{
|
||||
page_table_check_pte_set(mm, addr, ptep, pte);
|
||||
return __set_pte_at(mm, addr, ptep, pte);
|
||||
}
|
||||
|
||||
/*
|
||||
* Huge pte definitions.
|
||||
*/
|
||||
@@ -402,6 +411,22 @@ static inline pgprot_t mk_pmd_sect_prot(pgprot_t prot)
|
||||
return __pgprot((pgprot_val(prot) & ~PMD_TABLE_BIT) | PMD_TYPE_SECT);
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE
|
||||
static inline pte_t pte_swp_mkexclusive(pte_t pte)
|
||||
{
|
||||
return set_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE));
|
||||
}
|
||||
|
||||
static inline int pte_swp_exclusive(pte_t pte)
|
||||
{
|
||||
return pte_val(pte) & PTE_SWP_EXCLUSIVE;
|
||||
}
|
||||
|
||||
static inline pte_t pte_swp_clear_exclusive(pte_t pte)
|
||||
{
|
||||
return clear_pte_bit(pte, __pgprot(PTE_SWP_EXCLUSIVE));
|
||||
}
|
||||
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
/*
|
||||
* See the comment in include/linux/pgtable.h
|
||||
@@ -438,6 +463,8 @@ static inline int pmd_trans_huge(pmd_t pmd)
|
||||
#define pmd_dirty(pmd) pte_dirty(pmd_pte(pmd))
|
||||
#define pmd_young(pmd) pte_young(pmd_pte(pmd))
|
||||
#define pmd_valid(pmd) pte_valid(pmd_pte(pmd))
|
||||
#define pmd_user(pmd) pte_user(pmd_pte(pmd))
|
||||
#define pmd_user_exec(pmd) pte_user_exec(pmd_pte(pmd))
|
||||
#define pmd_cont(pmd) pte_cont(pmd_pte(pmd))
|
||||
#define pmd_wrprotect(pmd) pte_pmd(pte_wrprotect(pmd_pte(pmd)))
|
||||
#define pmd_mkold(pmd) pte_pmd(pte_mkold(pmd_pte(pmd)))
|
||||
@@ -485,8 +512,19 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd)
|
||||
#define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT)
|
||||
#define pfn_pud(pfn,prot) __pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))
|
||||
|
||||
#define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd))
|
||||
#define set_pud_at(mm, addr, pudp, pud) set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud))
|
||||
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
|
||||
pmd_t *pmdp, pmd_t pmd)
|
||||
{
|
||||
page_table_check_pmd_set(mm, addr, pmdp, pmd);
|
||||
return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd));
|
||||
}
|
||||
|
||||
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
|
||||
pud_t *pudp, pud_t pud)
|
||||
{
|
||||
page_table_check_pud_set(mm, addr, pudp, pud);
|
||||
return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud));
|
||||
}
|
||||
|
||||
#define __p4d_to_phys(p4d) __pte_to_phys(p4d_pte(p4d))
|
||||
#define __phys_to_p4d_val(phys) __phys_to_pte_val(phys)
|
||||
@@ -627,6 +665,8 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
|
||||
#define pud_present(pud) pte_present(pud_pte(pud))
|
||||
#define pud_leaf(pud) (pud_present(pud) && !pud_table(pud))
|
||||
#define pud_valid(pud) pte_valid(pud_pte(pud))
|
||||
#define pud_user(pud) pte_user(pud_pte(pud))
|
||||
|
||||
|
||||
static inline void set_pud(pud_t *pudp, pud_t pud)
|
||||
{
|
||||
@@ -799,6 +839,23 @@ static inline int pgd_devmap(pgd_t pgd)
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_CHECK
|
||||
static inline bool pte_user_accessible_page(pte_t pte)
|
||||
{
|
||||
return pte_present(pte) && (pte_user(pte) || pte_user_exec(pte));
|
||||
}
|
||||
|
||||
static inline bool pmd_user_accessible_page(pmd_t pmd)
|
||||
{
|
||||
return pmd_present(pmd) && (pmd_user(pmd) || pmd_user_exec(pmd));
|
||||
}
|
||||
|
||||
static inline bool pud_user_accessible_page(pud_t pud)
|
||||
{
|
||||
return pud_present(pud) && pud_user(pud);
|
||||
}
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Atomic pte/pmd modifications.
|
||||
*/
|
||||
@@ -860,7 +917,11 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
|
||||
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
|
||||
unsigned long address, pte_t *ptep)
|
||||
{
|
||||
return __pte(xchg_relaxed(&pte_val(*ptep), 0));
|
||||
pte_t pte = __pte(xchg_relaxed(&pte_val(*ptep), 0));
|
||||
|
||||
page_table_check_pte_clear(mm, address, pte);
|
||||
|
||||
return pte;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
@@ -868,7 +929,11 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
|
||||
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
|
||||
unsigned long address, pmd_t *pmdp)
|
||||
{
|
||||
return pte_pmd(ptep_get_and_clear(mm, address, (pte_t *)pmdp));
|
||||
pmd_t pmd = __pmd(xchg_relaxed(&pmd_val(*pmdp), 0));
|
||||
|
||||
page_table_check_pmd_clear(mm, address, pmd);
|
||||
|
||||
return pmd;
|
||||
}
|
||||
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||
|
||||
@@ -902,6 +967,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
|
||||
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
|
||||
unsigned long address, pmd_t *pmdp, pmd_t pmd)
|
||||
{
|
||||
page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd);
|
||||
return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd)));
|
||||
}
|
||||
#endif
|
||||
@@ -909,12 +975,13 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
|
||||
/*
|
||||
* Encode and decode a swap entry:
|
||||
* bits 0-1: present (must be zero)
|
||||
* bits 2-7: swap type
|
||||
* bits 2: remember PG_anon_exclusive
|
||||
* bits 3-7: swap type
|
||||
* bits 8-57: swap offset
|
||||
* bit 58: PTE_PROT_NONE (must be zero)
|
||||
*/
|
||||
#define __SWP_TYPE_SHIFT 2
|
||||
#define __SWP_TYPE_BITS 6
|
||||
#define __SWP_TYPE_SHIFT 3
|
||||
#define __SWP_TYPE_BITS 5
|
||||
#define __SWP_OFFSET_BITS 50
|
||||
#define __SWP_TYPE_MASK ((1 << __SWP_TYPE_BITS) - 1)
|
||||
#define __SWP_OFFSET_SHIFT (__SWP_TYPE_BITS + __SWP_TYPE_SHIFT)
|
||||
@@ -964,10 +1031,10 @@ static inline void arch_swap_invalidate_area(int type)
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_SWAP_RESTORE
|
||||
static inline void arch_swap_restore(swp_entry_t entry, struct page *page)
|
||||
static inline void arch_swap_restore(swp_entry_t entry, struct folio *folio)
|
||||
{
|
||||
if (system_supports_mte() && mte_restore_tags(entry, page))
|
||||
set_bit(PG_mte_tagged, &page->flags);
|
||||
if (system_supports_mte() && mte_restore_tags(entry, &folio->page))
|
||||
set_bit(PG_mte_tagged, &folio->flags);
|
||||
}
|
||||
|
||||
#endif /* CONFIG_ARM64_MTE */
|
||||
|
||||
@@ -75,6 +75,20 @@ EXPORT_SYMBOL_GPL(__sync_icache_dcache);
|
||||
*/
|
||||
void flush_dcache_page(struct page *page)
|
||||
{
|
||||
/*
|
||||
* Only the head page's flags of HugeTLB can be cleared since the tail
|
||||
* vmemmap pages associated with each HugeTLB page are mapped with
|
||||
* read-only when CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP is enabled (more
|
||||
* details can refer to vmemmap_remap_pte()). Although
|
||||
* __sync_icache_dcache() only sets the PG_dcache_clean flag on the head
|
||||
* page struct, there is more than one page struct with PG_dcache_clean
|
||||
* associated with the HugeTLB page since the head vmemmap page frame
|
||||
* is reused (more details can refer to the comments above
|
||||
* page_fixed_fake_head()).
|
||||
*/
|
||||
if (hugetlb_optimize_vmemmap_enabled() && PageHuge(page))
|
||||
page = compound_head(page);
|
||||
|
||||
if (test_bit(PG_dcache_clean, &page->flags))
|
||||
clear_bit(PG_dcache_clean, &page->flags);
|
||||
}
|
||||
|
||||
@@ -502,19 +502,17 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm,
|
||||
set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot));
|
||||
}
|
||||
|
||||
void huge_ptep_clear_flush(struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t *ptep)
|
||||
pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t *ptep)
|
||||
{
|
||||
size_t pgsize;
|
||||
int ncontig;
|
||||
|
||||
if (!pte_cont(READ_ONCE(*ptep))) {
|
||||
ptep_clear_flush(vma, addr, ptep);
|
||||
return;
|
||||
}
|
||||
if (!pte_cont(READ_ONCE(*ptep)))
|
||||
return ptep_clear_flush(vma, addr, ptep);
|
||||
|
||||
ncontig = find_num_contig(vma->vm_mm, addr, ptep, &pgsize);
|
||||
clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig);
|
||||
return get_clear_flush(vma->vm_mm, addr, ptep, pgsize, ncontig);
|
||||
}
|
||||
|
||||
static int __init hugetlbpage_init(void)
|
||||
|
||||
@@ -55,3 +55,28 @@ static int __init adjust_protection_map(void)
|
||||
return 0;
|
||||
}
|
||||
arch_initcall(adjust_protection_map);
|
||||
|
||||
pgprot_t vm_get_page_prot(unsigned long vm_flags)
|
||||
{
|
||||
pteval_t prot = pgprot_val(protection_map[vm_flags &
|
||||
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);
|
||||
|
||||
if (vm_flags & VM_ARM64_BTI)
|
||||
prot |= PTE_GP;
|
||||
|
||||
/*
|
||||
* There are two conditions required for returning a Normal Tagged
|
||||
* memory type: (1) the user requested it via PROT_MTE passed to
|
||||
* mmap() or mprotect() and (2) the corresponding vma supports MTE. We
|
||||
* register (1) as VM_MTE in the vma->vm_flags and (2) as
|
||||
* VM_MTE_ALLOWED. Note that the latter can only be set during the
|
||||
* mmap() call since mprotect() does not accept MAP_* flags.
|
||||
* Checking for VM_MTE only is sufficient since arch_validate_flags()
|
||||
* does not permit (VM_MTE & !VM_MTE_ALLOWED).
|
||||
*/
|
||||
if (vm_flags & VM_MTE)
|
||||
prot |= PTE_ATTRINDX(MT_NORMAL_TAGGED);
|
||||
|
||||
return __pgprot(prot);
|
||||
}
|
||||
EXPORT_SYMBOL(vm_get_page_prot);
|
||||
|
||||
@@ -4,9 +4,9 @@
|
||||
#define __ASM_CSKY_PROCESSOR_H
|
||||
|
||||
#include <linux/bitops.h>
|
||||
#include <linux/cache.h>
|
||||
#include <asm/ptrace.h>
|
||||
#include <asm/current.h>
|
||||
#include <asm/cache.h>
|
||||
#include <abi/reg_ops.h>
|
||||
#include <abi/regdef.h>
|
||||
#include <abi/switch_context.h>
|
||||
|
||||
@@ -23,9 +23,10 @@ static inline int is_hugepage_only_range(struct mm_struct *mm,
|
||||
#define is_hugepage_only_range is_hugepage_only_range
|
||||
|
||||
#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
|
||||
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t *ptep)
|
||||
static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t *ptep)
|
||||
{
|
||||
return *ptep;
|
||||
}
|
||||
|
||||
#include <asm-generic/hugetlb.h>
|
||||
|
||||
@@ -261,7 +261,7 @@ static int __init uncached_init(void)
|
||||
{
|
||||
int nid;
|
||||
|
||||
for_each_node_state(nid, N_ONLINE) {
|
||||
for_each_online_node(nid) {
|
||||
uncached_pools[nid].pool = gen_pool_create(PAGE_SHIFT, nid);
|
||||
mutex_init(&uncached_pools[nid].add_chunk_mutex);
|
||||
}
|
||||
|
||||
@@ -43,16 +43,19 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
|
||||
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t *ptep)
|
||||
static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t *ptep)
|
||||
{
|
||||
pte_t pte;
|
||||
|
||||
/*
|
||||
* clear the huge pte entry firstly, so that the other smp threads will
|
||||
* not get old pte entry after finishing flush_tlb_page and before
|
||||
* setting new huge pte entry
|
||||
*/
|
||||
huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
|
||||
pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
|
||||
flush_tlb_page(vma, addr);
|
||||
return pte;
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_HUGE_PTE_NONE
|
||||
|
||||
@@ -28,9 +28,10 @@ static inline int prepare_hugepage_range(struct file *file,
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
|
||||
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t *ptep)
|
||||
static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t *ptep)
|
||||
{
|
||||
return *ptep;
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
|
||||
|
||||
@@ -140,6 +140,7 @@ config PPC
|
||||
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
|
||||
select ARCH_HAS_UACCESS_FLUSHCACHE
|
||||
select ARCH_HAS_UBSAN_SANITIZE_ALL
|
||||
select ARCH_HAS_VM_GET_PAGE_PROT if PPC_BOOK3S_64
|
||||
select ARCH_HAVE_NMI_SAFE_CMPXCHG
|
||||
select ARCH_KEEP_MEMBLOCK
|
||||
select ARCH_MIGHT_HAVE_PC_PARPORT
|
||||
|
||||
@@ -13,7 +13,6 @@
|
||||
/*
|
||||
* Common bits between hash and Radix page table
|
||||
*/
|
||||
#define _PAGE_BIT_SWAP_TYPE 0
|
||||
|
||||
#define _PAGE_EXEC 0x00001 /* execute permission */
|
||||
#define _PAGE_WRITE 0x00002 /* write access allowed */
|
||||
@@ -751,17 +750,17 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
|
||||
* Don't have overlapping bits with _PAGE_HPTEFLAGS \
|
||||
* We filter HPTEFLAGS on set_pte. \
|
||||
*/ \
|
||||
BUILD_BUG_ON(_PAGE_HPTEFLAGS & (0x1f << _PAGE_BIT_SWAP_TYPE)); \
|
||||
BUILD_BUG_ON(_PAGE_HPTEFLAGS & SWP_TYPE_MASK); \
|
||||
BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_SOFT_DIRTY); \
|
||||
BUILD_BUG_ON(_PAGE_HPTEFLAGS & _PAGE_SWP_EXCLUSIVE); \
|
||||
} while (0)
|
||||
|
||||
#define SWP_TYPE_BITS 5
|
||||
#define __swp_type(x) (((x).val >> _PAGE_BIT_SWAP_TYPE) \
|
||||
& ((1UL << SWP_TYPE_BITS) - 1))
|
||||
#define SWP_TYPE_MASK ((1UL << SWP_TYPE_BITS) - 1)
|
||||
#define __swp_type(x) ((x).val & SWP_TYPE_MASK)
|
||||
#define __swp_offset(x) (((x).val & PTE_RPN_MASK) >> PAGE_SHIFT)
|
||||
#define __swp_entry(type, offset) ((swp_entry_t) { \
|
||||
((type) << _PAGE_BIT_SWAP_TYPE) \
|
||||
| (((offset) << PAGE_SHIFT) & PTE_RPN_MASK)})
|
||||
(type) | (((offset) << PAGE_SHIFT) & PTE_RPN_MASK)})
|
||||
/*
|
||||
* swp_entry_t must be independent of pte bits. We build a swp_entry_t from
|
||||
* swap type and offset we get from swap and convert that to pte to find a
|
||||
@@ -774,11 +773,13 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
|
||||
#define __swp_entry_to_pmd(x) (pte_pmd(__swp_entry_to_pte(x)))
|
||||
|
||||
#ifdef CONFIG_MEM_SOFT_DIRTY
|
||||
#define _PAGE_SWP_SOFT_DIRTY (1UL << (SWP_TYPE_BITS + _PAGE_BIT_SWAP_TYPE))
|
||||
#define _PAGE_SWP_SOFT_DIRTY _PAGE_SOFT_DIRTY
|
||||
#else
|
||||
#define _PAGE_SWP_SOFT_DIRTY 0UL
|
||||
#endif /* CONFIG_MEM_SOFT_DIRTY */
|
||||
|
||||
#define _PAGE_SWP_EXCLUSIVE _PAGE_NON_IDEMPOTENT
|
||||
|
||||
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
|
||||
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
|
||||
{
|
||||
@@ -796,6 +797,22 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
|
||||
}
|
||||
#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
|
||||
|
||||
#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE
|
||||
static inline pte_t pte_swp_mkexclusive(pte_t pte)
|
||||
{
|
||||
return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SWP_EXCLUSIVE));
|
||||
}
|
||||
|
||||
static inline int pte_swp_exclusive(pte_t pte)
|
||||
{
|
||||
return !!(pte_raw(pte) & cpu_to_be64(_PAGE_SWP_EXCLUSIVE));
|
||||
}
|
||||
|
||||
static inline pte_t pte_swp_clear_exclusive(pte_t pte)
|
||||
{
|
||||
return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_SWP_EXCLUSIVE));
|
||||
}
|
||||
|
||||
static inline bool check_pte_access(unsigned long access, unsigned long ptev)
|
||||
{
|
||||
/*
|
||||
|
||||
@@ -43,11 +43,14 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
|
||||
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t *ptep)
|
||||
static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t *ptep)
|
||||
{
|
||||
huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
|
||||
pte_t pte;
|
||||
|
||||
pte = huge_ptep_get_and_clear(vma->vm_mm, addr, ptep);
|
||||
flush_hugetlb_page(vma, addr);
|
||||
return pte;
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS
|
||||
|
||||
@@ -24,18 +24,6 @@ static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
|
||||
}
|
||||
#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
|
||||
|
||||
static inline pgprot_t arch_vm_get_page_prot(unsigned long vm_flags)
|
||||
{
|
||||
#ifdef CONFIG_PPC_MEM_KEYS
|
||||
return (vm_flags & VM_SAO) ?
|
||||
__pgprot(_PAGE_SAO | vmflag_to_pte_pkey_bits(vm_flags)) :
|
||||
__pgprot(0 | vmflag_to_pte_pkey_bits(vm_flags));
|
||||
#else
|
||||
return (vm_flags & VM_SAO) ? __pgprot(_PAGE_SAO) : __pgprot(0);
|
||||
#endif
|
||||
}
|
||||
#define arch_vm_get_page_prot(vm_flags) arch_vm_get_page_prot(vm_flags)
|
||||
|
||||
static inline bool arch_validate_prot(unsigned long prot, unsigned long addr)
|
||||
{
|
||||
if (prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM | PROT_SAO))
|
||||
|
||||
@@ -7,6 +7,7 @@
|
||||
#include <linux/mm_types.h>
|
||||
#include <linux/memblock.h>
|
||||
#include <linux/memremap.h>
|
||||
#include <linux/pkeys.h>
|
||||
#include <linux/debugfs.h>
|
||||
#include <misc/cxl-base.h>
|
||||
|
||||
@@ -549,3 +550,19 @@ unsigned long memremap_compat_align(void)
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(memremap_compat_align);
|
||||
#endif
|
||||
|
||||
pgprot_t vm_get_page_prot(unsigned long vm_flags)
|
||||
{
|
||||
unsigned long prot = pgprot_val(protection_map[vm_flags &
|
||||
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);
|
||||
|
||||
if (vm_flags & VM_SAO)
|
||||
prot |= _PAGE_SAO;
|
||||
|
||||
#ifdef CONFIG_PPC_MEM_KEYS
|
||||
prot |= vmflag_to_pte_pkey_bits(vm_flags);
|
||||
#endif
|
||||
|
||||
return __pgprot(prot);
|
||||
}
|
||||
EXPORT_SYMBOL(vm_get_page_prot);
|
||||
|
||||
@@ -38,6 +38,7 @@ config RISCV
|
||||
select ARCH_SUPPORTS_ATOMIC_RMW
|
||||
select ARCH_SUPPORTS_DEBUG_PAGEALLOC if MMU
|
||||
select ARCH_SUPPORTS_HUGETLBFS if MMU
|
||||
select ARCH_SUPPORTS_PAGE_TABLE_CHECK
|
||||
select ARCH_USE_MEMTEST
|
||||
select ARCH_USE_QUEUED_RWLOCKS
|
||||
select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU
|
||||
|
||||
@@ -86,6 +86,11 @@ static inline int pud_leaf(pud_t pud)
|
||||
return pud_present(pud) && (pud_val(pud) & _PAGE_LEAF);
|
||||
}
|
||||
|
||||
static inline int pud_user(pud_t pud)
|
||||
{
|
||||
return pud_val(pud) & _PAGE_USER;
|
||||
}
|
||||
|
||||
static inline void set_pud(pud_t *pudp, pud_t pud)
|
||||
{
|
||||
*pudp = pud;
|
||||
|
||||
@@ -114,6 +114,8 @@
|
||||
#include <asm/pgtable-32.h>
|
||||
#endif /* CONFIG_64BIT */
|
||||
|
||||
#include <linux/page_table_check.h>
|
||||
|
||||
#ifdef CONFIG_XIP_KERNEL
|
||||
#define XIP_FIXUP(addr) ({ \
|
||||
uintptr_t __a = (uintptr_t)(addr); \
|
||||
@@ -315,6 +317,11 @@ static inline int pte_exec(pte_t pte)
|
||||
return pte_val(pte) & _PAGE_EXEC;
|
||||
}
|
||||
|
||||
static inline int pte_user(pte_t pte)
|
||||
{
|
||||
return pte_val(pte) & _PAGE_USER;
|
||||
}
|
||||
|
||||
static inline int pte_huge(pte_t pte)
|
||||
{
|
||||
return pte_present(pte) && (pte_val(pte) & _PAGE_LEAF);
|
||||
@@ -446,7 +453,7 @@ static inline void set_pte(pte_t *ptep, pte_t pteval)
|
||||
|
||||
void flush_icache_pte(pte_t pte);
|
||||
|
||||
static inline void set_pte_at(struct mm_struct *mm,
|
||||
static inline void __set_pte_at(struct mm_struct *mm,
|
||||
unsigned long addr, pte_t *ptep, pte_t pteval)
|
||||
{
|
||||
if (pte_present(pteval) && pte_exec(pteval))
|
||||
@@ -455,10 +462,17 @@ static inline void set_pte_at(struct mm_struct *mm,
|
||||
set_pte(ptep, pteval);
|
||||
}
|
||||
|
||||
static inline void set_pte_at(struct mm_struct *mm,
|
||||
unsigned long addr, pte_t *ptep, pte_t pteval)
|
||||
{
|
||||
page_table_check_pte_set(mm, addr, ptep, pteval);
|
||||
__set_pte_at(mm, addr, ptep, pteval);
|
||||
}
|
||||
|
||||
static inline void pte_clear(struct mm_struct *mm,
|
||||
unsigned long addr, pte_t *ptep)
|
||||
{
|
||||
set_pte_at(mm, addr, ptep, __pte(0));
|
||||
__set_pte_at(mm, addr, ptep, __pte(0));
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
|
||||
@@ -479,7 +493,11 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
|
||||
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
|
||||
unsigned long address, pte_t *ptep)
|
||||
{
|
||||
return __pte(atomic_long_xchg((atomic_long_t *)ptep, 0));
|
||||
pte_t pte = __pte(atomic_long_xchg((atomic_long_t *)ptep, 0));
|
||||
|
||||
page_table_check_pte_clear(mm, address, pte);
|
||||
|
||||
return pte;
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG
|
||||
@@ -546,6 +564,13 @@ static inline unsigned long pmd_pfn(pmd_t pmd)
|
||||
return ((__pmd_to_phys(pmd) & PMD_MASK) >> PAGE_SHIFT);
|
||||
}
|
||||
|
||||
#define __pud_to_phys(pud) (pud_val(pud) >> _PAGE_PFN_SHIFT << PAGE_SHIFT)
|
||||
|
||||
static inline unsigned long pud_pfn(pud_t pud)
|
||||
{
|
||||
return ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT);
|
||||
}
|
||||
|
||||
static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
|
||||
{
|
||||
return pte_pmd(pte_modify(pmd_pte(pmd), newprot));
|
||||
@@ -567,6 +592,11 @@ static inline int pmd_young(pmd_t pmd)
|
||||
return pte_young(pmd_pte(pmd));
|
||||
}
|
||||
|
||||
static inline int pmd_user(pmd_t pmd)
|
||||
{
|
||||
return pte_user(pmd_pte(pmd));
|
||||
}
|
||||
|
||||
static inline pmd_t pmd_mkold(pmd_t pmd)
|
||||
{
|
||||
return pte_pmd(pte_mkold(pmd_pte(pmd)));
|
||||
@@ -600,15 +630,34 @@ static inline pmd_t pmd_mkdirty(pmd_t pmd)
|
||||
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
|
||||
pmd_t *pmdp, pmd_t pmd)
|
||||
{
|
||||
return set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd));
|
||||
page_table_check_pmd_set(mm, addr, pmdp, pmd);
|
||||
return __set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd));
|
||||
}
|
||||
|
||||
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
|
||||
pud_t *pudp, pud_t pud)
|
||||
{
|
||||
return set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud));
|
||||
page_table_check_pud_set(mm, addr, pudp, pud);
|
||||
return __set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud));
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_CHECK
|
||||
static inline bool pte_user_accessible_page(pte_t pte)
|
||||
{
|
||||
return pte_present(pte) && pte_user(pte);
|
||||
}
|
||||
|
||||
static inline bool pmd_user_accessible_page(pmd_t pmd)
|
||||
{
|
||||
return pmd_leaf(pmd) && pmd_user(pmd);
|
||||
}
|
||||
|
||||
static inline bool pud_user_accessible_page(pud_t pud)
|
||||
{
|
||||
return pud_leaf(pud) && pud_user(pud);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
static inline int pmd_trans_huge(pmd_t pmd)
|
||||
{
|
||||
@@ -634,7 +683,11 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
|
||||
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm,
|
||||
unsigned long address, pmd_t *pmdp)
|
||||
{
|
||||
return pte_pmd(ptep_get_and_clear(mm, address, (pte_t *)pmdp));
|
||||
pmd_t pmd = __pmd(atomic_long_xchg((atomic_long_t *)pmdp, 0));
|
||||
|
||||
page_table_check_pmd_clear(mm, address, pmd);
|
||||
|
||||
return pmd;
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
|
||||
@@ -648,6 +701,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
|
||||
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
|
||||
unsigned long address, pmd_t *pmdp, pmd_t pmd)
|
||||
{
|
||||
page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd);
|
||||
return __pmd(atomic_long_xchg((atomic_long_t *)pmdp, pmd_val(pmd)));
|
||||
}
|
||||
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||
|
||||
@@ -50,10 +50,10 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
|
||||
set_pte(ptep, __pte(_SEGMENT_ENTRY_EMPTY));
|
||||
}
|
||||
|
||||
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
|
||||
unsigned long address, pte_t *ptep)
|
||||
static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
|
||||
unsigned long address, pte_t *ptep)
|
||||
{
|
||||
huge_ptep_get_and_clear(vma->vm_mm, address, ptep);
|
||||
return huge_ptep_get_and_clear(vma->vm_mm, address, ptep);
|
||||
}
|
||||
|
||||
static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma,
|
||||
@@ -85,6 +85,11 @@ static inline int huge_pte_none(pte_t pte)
|
||||
return pte_none(pte);
|
||||
}
|
||||
|
||||
static inline int huge_pte_none_mostly(pte_t pte)
|
||||
{
|
||||
return huge_pte_none(pte);
|
||||
}
|
||||
|
||||
static inline int huge_pte_write(pte_t pte)
|
||||
{
|
||||
return pte_write(pte);
|
||||
@@ -115,6 +120,21 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
|
||||
return pte_modify(pte, newprot);
|
||||
}
|
||||
|
||||
static inline pte_t huge_pte_mkuffd_wp(pte_t pte)
|
||||
{
|
||||
return pte;
|
||||
}
|
||||
|
||||
static inline pte_t huge_pte_clear_uffd_wp(pte_t pte)
|
||||
{
|
||||
return pte;
|
||||
}
|
||||
|
||||
static inline int huge_pte_uffd_wp(pte_t pte)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline bool gigantic_page_runtime_supported(void)
|
||||
{
|
||||
return true;
|
||||
|
||||
@@ -181,6 +181,8 @@ static inline int is_module_addr(void *addr)
|
||||
#define _PAGE_SOFT_DIRTY 0x000
|
||||
#endif
|
||||
|
||||
#define _PAGE_SWP_EXCLUSIVE _PAGE_LARGE /* SW pte exclusive swap bit */
|
||||
|
||||
/* Set of bits not changed in pte_modify */
|
||||
#define _PAGE_CHG_MASK (PAGE_MASK | _PAGE_SPECIAL | _PAGE_DIRTY | \
|
||||
_PAGE_YOUNG | _PAGE_SOFT_DIRTY)
|
||||
@@ -826,6 +828,22 @@ static inline int pmd_protnone(pmd_t pmd)
|
||||
}
|
||||
#endif
|
||||
|
||||
#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE
|
||||
static inline int pte_swp_exclusive(pte_t pte)
|
||||
{
|
||||
return pte_val(pte) & _PAGE_SWP_EXCLUSIVE;
|
||||
}
|
||||
|
||||
static inline pte_t pte_swp_mkexclusive(pte_t pte)
|
||||
{
|
||||
return set_pte_bit(pte, __pgprot(_PAGE_SWP_EXCLUSIVE));
|
||||
}
|
||||
|
||||
static inline pte_t pte_swp_clear_exclusive(pte_t pte)
|
||||
{
|
||||
return clear_pte_bit(pte, __pgprot(_PAGE_SWP_EXCLUSIVE));
|
||||
}
|
||||
|
||||
static inline int pte_soft_dirty(pte_t pte)
|
||||
{
|
||||
return pte_val(pte) & _PAGE_SOFT_DIRTY;
|
||||
@@ -1712,18 +1730,18 @@ static inline int has_transparent_hugepage(void)
|
||||
/*
|
||||
* 64 bit swap entry format:
|
||||
* A page-table entry has some bits we have to treat in a special way.
|
||||
* Bits 52 and bit 55 have to be zero, otherwise a specification
|
||||
* exception will occur instead of a page translation exception. The
|
||||
* specification exception has the bad habit not to store necessary
|
||||
* information in the lowcore.
|
||||
* Bits 54 and 63 are used to indicate the page type.
|
||||
* Bits 54 and 63 are used to indicate the page type. Bit 53 marks the pte
|
||||
* as invalid.
|
||||
* A swap pte is indicated by bit pattern (pte & 0x201) == 0x200
|
||||
* This leaves the bits 0-51 and bits 56-62 to store type and offset.
|
||||
* We use the 5 bits from 57-61 for the type and the 52 bits from 0-51
|
||||
* for the offset.
|
||||
* | offset |01100|type |00|
|
||||
* | offset |E11XX|type |S0|
|
||||
* |0000000000111111111122222222223333333333444444444455|55555|55566|66|
|
||||
* |0123456789012345678901234567890123456789012345678901|23456|78901|23|
|
||||
*
|
||||
* Bits 0-51 store the offset.
|
||||
* Bit 52 (E) is used to remember PG_anon_exclusive.
|
||||
* Bits 57-61 store the type.
|
||||
* Bit 62 (S) is used for softdirty tracking.
|
||||
* Bits 55 and 56 (X) are unused.
|
||||
*/
|
||||
|
||||
#define __SWP_OFFSET_MASK ((1UL << 52) - 1)
|
||||
|
||||
@@ -21,9 +21,10 @@ static inline int prepare_hugepage_range(struct file *file,
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
|
||||
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t *ptep)
|
||||
static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t *ptep)
|
||||
{
|
||||
return *ptep;
|
||||
}
|
||||
|
||||
static inline void arch_clear_hugepage_flags(struct page *page)
|
||||
|
||||
@@ -84,6 +84,7 @@ config SPARC64
|
||||
select PERF_USE_VMALLOC
|
||||
select ARCH_HAVE_NMI_SAFE_CMPXCHG
|
||||
select HAVE_C_RECORDMCOUNT
|
||||
select ARCH_HAS_VM_GET_PAGE_PROT
|
||||
select HAVE_ARCH_AUDITSYSCALL
|
||||
select ARCH_SUPPORTS_ATOMIC_RMW
|
||||
select ARCH_SUPPORTS_DEBUG_PAGEALLOC
|
||||
|
||||
@@ -21,9 +21,10 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
|
||||
pte_t *ptep);
|
||||
|
||||
#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
|
||||
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t *ptep)
|
||||
static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t *ptep)
|
||||
{
|
||||
return *ptep;
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT
|
||||
|
||||
@@ -46,12 +46,6 @@ static inline unsigned long sparc_calc_vm_prot_bits(unsigned long prot)
|
||||
}
|
||||
}
|
||||
|
||||
#define arch_vm_get_page_prot(vm_flags) sparc_vm_get_page_prot(vm_flags)
|
||||
static inline pgprot_t sparc_vm_get_page_prot(unsigned long vm_flags)
|
||||
{
|
||||
return (vm_flags & VM_SPARC_ADI) ? __pgprot(_PAGE_MCD_4V) : __pgprot(0);
|
||||
}
|
||||
|
||||
#define arch_validate_prot(prot, addr) sparc_validate_prot(prot, addr)
|
||||
static inline int sparc_validate_prot(unsigned long prot, unsigned long addr)
|
||||
{
|
||||
|
||||
@@ -3184,3 +3184,15 @@ void copy_highpage(struct page *to, struct page *from)
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL(copy_highpage);
|
||||
|
||||
pgprot_t vm_get_page_prot(unsigned long vm_flags)
|
||||
{
|
||||
unsigned long prot = pgprot_val(protection_map[vm_flags &
|
||||
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);
|
||||
|
||||
if (vm_flags & VM_SPARC_ADI)
|
||||
prot |= _PAGE_MCD_4V;
|
||||
|
||||
return __pgprot(prot);
|
||||
}
|
||||
EXPORT_SYMBOL(vm_get_page_prot);
|
||||
|
||||
@@ -76,7 +76,6 @@ config X86
|
||||
select ARCH_HAS_EARLY_DEBUG if KGDB
|
||||
select ARCH_HAS_ELF_RANDOMIZE
|
||||
select ARCH_HAS_FAST_MULTIPLIER
|
||||
select ARCH_HAS_FILTER_PGPROT
|
||||
select ARCH_HAS_FORTIFY_SOURCE
|
||||
select ARCH_HAS_GCOV_PROFILE_ALL
|
||||
select ARCH_HAS_KCOV if X86_64
|
||||
@@ -95,6 +94,7 @@ config X86
|
||||
select ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
|
||||
select ARCH_HAS_SYSCALL_WRAPPER
|
||||
select ARCH_HAS_UBSAN_SANITIZE_ALL
|
||||
select ARCH_HAS_VM_GET_PAGE_PROT
|
||||
select ARCH_HAS_DEBUG_WX
|
||||
select ARCH_HAS_ZONE_DMA_SET if EXPERT
|
||||
select ARCH_HAVE_NMI_SAFE_CMPXCHG
|
||||
@@ -121,6 +121,7 @@ config X86
|
||||
select ARCH_WANTS_NO_INSTR
|
||||
select ARCH_WANT_GENERAL_HUGETLB
|
||||
select ARCH_WANT_HUGE_PMD_SHARE
|
||||
select ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP if X86_64
|
||||
select ARCH_WANT_LD_ORPHAN_WARN
|
||||
select ARCH_WANTS_THP_SWAP if X86_64
|
||||
select ARCH_HAS_PARANOID_L1D_FLUSH
|
||||
|
||||
@@ -649,11 +649,6 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
|
||||
|
||||
#define canon_pgprot(p) __pgprot(massage_pgprot(p))
|
||||
|
||||
static inline pgprot_t arch_filter_pgprot(pgprot_t prot)
|
||||
{
|
||||
return canon_pgprot(prot);
|
||||
}
|
||||
|
||||
static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
|
||||
enum page_cache_mode pcm,
|
||||
enum page_cache_mode new_pcm)
|
||||
@@ -1077,16 +1072,6 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
|
||||
return pte;
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_PTEP_CLEAR
|
||||
static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
|
||||
pte_t *ptep)
|
||||
{
|
||||
if (IS_ENABLED(CONFIG_PAGE_TABLE_CHECK))
|
||||
ptep_get_and_clear(mm, addr, ptep);
|
||||
else
|
||||
pte_clear(mm, addr, ptep);
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
|
||||
static inline void ptep_set_wrprotect(struct mm_struct *mm,
|
||||
unsigned long addr, pte_t *ptep)
|
||||
@@ -1173,6 +1158,11 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#define __HAVE_ARCH_PMDP_INVALIDATE_AD
|
||||
extern pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma,
|
||||
unsigned long address, pmd_t *pmdp);
|
||||
|
||||
/*
|
||||
* Page table pages are page-aligned. The lower half of the top
|
||||
* level is used for userspace and the top half for the kernel.
|
||||
@@ -1291,6 +1281,23 @@ static inline void update_mmu_cache_pud(struct vm_area_struct *vma,
|
||||
unsigned long addr, pud_t *pud)
|
||||
{
|
||||
}
|
||||
#ifdef _PAGE_SWP_EXCLUSIVE
|
||||
#define __HAVE_ARCH_PTE_SWP_EXCLUSIVE
|
||||
static inline pte_t pte_swp_mkexclusive(pte_t pte)
|
||||
{
|
||||
return pte_set_flags(pte, _PAGE_SWP_EXCLUSIVE);
|
||||
}
|
||||
|
||||
static inline int pte_swp_exclusive(pte_t pte)
|
||||
{
|
||||
return pte_flags(pte) & _PAGE_SWP_EXCLUSIVE;
|
||||
}
|
||||
|
||||
static inline pte_t pte_swp_clear_exclusive(pte_t pte)
|
||||
{
|
||||
return pte_clear_flags(pte, _PAGE_SWP_EXCLUSIVE);
|
||||
}
|
||||
#endif /* _PAGE_SWP_EXCLUSIVE */
|
||||
|
||||
#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY
|
||||
static inline pte_t pte_swp_mksoft_dirty(pte_t pte)
|
||||
@@ -1430,6 +1437,23 @@ static inline bool arch_faults_on_old_pte(void)
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_CHECK
|
||||
static inline bool pte_user_accessible_page(pte_t pte)
|
||||
{
|
||||
return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER);
|
||||
}
|
||||
|
||||
static inline bool pmd_user_accessible_page(pmd_t pmd)
|
||||
{
|
||||
return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) && (pmd_val(pmd) & _PAGE_USER);
|
||||
}
|
||||
|
||||
static inline bool pud_user_accessible_page(pud_t pud)
|
||||
{
|
||||
return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) && (pud_val(pud) & _PAGE_USER);
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* __ASSEMBLY__ */
|
||||
|
||||
#endif /* _ASM_X86_PGTABLE_H */
|
||||
|
||||
@@ -186,7 +186,7 @@ static inline void native_pgd_clear(pgd_t *pgd)
|
||||
*
|
||||
* | ... | 11| 10| 9|8|7|6|5| 4| 3|2| 1|0| <- bit number
|
||||
* | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U| W|P| <- bit names
|
||||
* | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| X|F|SD|0| <- swp entry
|
||||
* | TYPE (59-63) | ~OFFSET (9-58) |0|0|X|X| X| E|F|SD|0| <- swp entry
|
||||
*
|
||||
* G (8) is aliased and used as a PROT_NONE indicator for
|
||||
* !present ptes. We need to start storing swap entries above
|
||||
@@ -203,6 +203,8 @@ static inline void native_pgd_clear(pgd_t *pgd)
|
||||
* F (2) in swp entry is used to record when a pagetable is
|
||||
* writeprotected by userfaultfd WP support.
|
||||
*
|
||||
* E (3) in swp entry is used to remember PG_anon_exclusive.
|
||||
*
|
||||
* Bit 7 in swp entry should be 0 because pmd_present checks not only P,
|
||||
* but also L and G.
|
||||
*
|
||||
|
||||
@@ -163,4 +163,9 @@ extern unsigned int ptrs_per_p4d;
|
||||
|
||||
#define PGD_KERNEL_START ((PAGE_SIZE / 2) / sizeof(pgd_t))
|
||||
|
||||
/*
|
||||
* We borrow bit 3 to remember PG_anon_exclusive.
|
||||
*/
|
||||
#define _PAGE_SWP_EXCLUSIVE _PAGE_PWT
|
||||
|
||||
#endif /* _ASM_X86_PGTABLE_64_DEFS_H */
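As a small illustration of the helpers added to arch/x86/include/asm/pgtable.h earlier in this diff (a sketch only; the helper name below is invented, and the argument must already be a swap pte), the anon-exclusive marker is just a software flag carried in the otherwise-unused _PAGE_PWT bit:

	/* Illustrative only: set, test and clear _PAGE_SWP_EXCLUSIVE on a swap pte. */
	static bool sketch_swp_exclusive_roundtrip(pte_t swp_pte)
	{
		swp_pte = pte_swp_mkexclusive(swp_pte);
		if (!pte_swp_exclusive(swp_pte))
			return false;

		swp_pte = pte_swp_clear_exclusive(swp_pte);
		return !pte_swp_exclusive(swp_pte);	/* true again after clearing */
	}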
|
||||
|
||||
@@ -110,9 +110,11 @@
|
||||
#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
|
||||
#define _PAGE_NX (_AT(pteval_t, 1) << _PAGE_BIT_NX)
|
||||
#define _PAGE_DEVMAP (_AT(u64, 1) << _PAGE_BIT_DEVMAP)
|
||||
#define _PAGE_SOFTW4 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW4)
|
||||
#else
|
||||
#define _PAGE_NX (_AT(pteval_t, 0))
|
||||
#define _PAGE_DEVMAP (_AT(pteval_t, 0))
|
||||
#define _PAGE_SOFTW4 (_AT(pteval_t, 0))
|
||||
#endif
|
||||
|
||||
#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
|
||||
|
||||
@@ -259,6 +259,103 @@ static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
|
||||
|
||||
extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
|
||||
|
||||
static inline bool pte_flags_need_flush(unsigned long oldflags,
|
||||
unsigned long newflags,
|
||||
bool ignore_access)
|
||||
{
|
||||
/*
|
||||
* Flags that require a flush when cleared but not when they are set.
|
||||
* Only include flags that would not trigger spurious page-faults.
|
||||
* Non-present entries are not cached. Hardware would set the
|
||||
* dirty/access bit if needed without a fault.
|
||||
*/
|
||||
const pteval_t flush_on_clear = _PAGE_DIRTY | _PAGE_PRESENT |
|
||||
_PAGE_ACCESSED;
|
||||
const pteval_t software_flags = _PAGE_SOFTW1 | _PAGE_SOFTW2 |
|
||||
_PAGE_SOFTW3 | _PAGE_SOFTW4;
|
||||
const pteval_t flush_on_change = _PAGE_RW | _PAGE_USER | _PAGE_PWT |
|
||||
_PAGE_PCD | _PAGE_PSE | _PAGE_GLOBAL | _PAGE_PAT |
|
||||
_PAGE_PAT_LARGE | _PAGE_PKEY_BIT0 | _PAGE_PKEY_BIT1 |
|
||||
_PAGE_PKEY_BIT2 | _PAGE_PKEY_BIT3 | _PAGE_NX;
|
||||
unsigned long diff = oldflags ^ newflags;
|
||||
|
||||
BUILD_BUG_ON(flush_on_clear & software_flags);
|
||||
BUILD_BUG_ON(flush_on_clear & flush_on_change);
|
||||
BUILD_BUG_ON(flush_on_change & software_flags);
|
||||
|
||||
/* Ignore software flags */
|
||||
diff &= ~software_flags;
|
||||
|
||||
if (ignore_access)
|
||||
diff &= ~_PAGE_ACCESSED;
|
||||
|
||||
/*
|
||||
* Was any of the 'flush_on_clear' flags cleared between
|
||||
* 'oldflags' and 'newflags'?
|
||||
*/
|
||||
if (diff & oldflags & flush_on_clear)
|
||||
return true;
|
||||
|
||||
/* Flush on modified flags. */
|
||||
if (diff & flush_on_change)
|
||||
return true;
|
||||
|
||||
/* Ensure there are no flags that were left behind */
|
||||
if (IS_ENABLED(CONFIG_DEBUG_VM) &&
|
||||
(diff & ~(flush_on_clear | software_flags | flush_on_change))) {
|
||||
VM_WARN_ON_ONCE(1);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/*
|
||||
* pte_needs_flush() checks whether permissions were demoted and require a
|
||||
* flush. It should only be used for userspace PTEs.
|
||||
*/
|
||||
static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte)
|
||||
{
|
||||
/* !PRESENT -> * ; no need for flush */
|
||||
if (!(pte_flags(oldpte) & _PAGE_PRESENT))
|
||||
return false;
|
||||
|
||||
/* PFN changed ; needs flush */
|
||||
if (pte_pfn(oldpte) != pte_pfn(newpte))
|
||||
return true;
|
||||
|
||||
/*
|
||||
* check PTE flags; ignore access-bit; see comment in
|
||||
* ptep_clear_flush_young().
|
||||
*/
|
||||
return pte_flags_need_flush(pte_flags(oldpte), pte_flags(newpte),
|
||||
true);
|
||||
}
|
||||
#define pte_needs_flush pte_needs_flush
|
||||
|
||||
/*
|
||||
* huge_pmd_needs_flush() checks whether permissions were demoted and require a
|
||||
* flush. It should only be used for userspace huge PMDs.
|
||||
*/
|
||||
static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
|
||||
{
|
||||
/* !PRESENT -> * ; no need for flush */
|
||||
if (!(pmd_flags(oldpmd) & _PAGE_PRESENT))
|
||||
return false;
|
||||
|
||||
/* PFN changed ; needs flush */
|
||||
if (pmd_pfn(oldpmd) != pmd_pfn(newpmd))
|
||||
return true;
|
||||
|
||||
/*
|
||||
* check PMD flags; do not ignore access-bit; see
|
||||
* pmdp_clear_flush_young().
|
||||
*/
|
||||
return pte_flags_need_flush(pmd_flags(oldpmd), pmd_flags(newpmd),
|
||||
false);
|
||||
}
|
||||
#define huge_pmd_needs_flush huge_pmd_needs_flush
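For readers following the mprotect() TLB-optimization series, here is a minimal, illustrative sketch (not the kernel's actual change_pte_range() in mm/mprotect.c; the helper name is invented) of how a permission-change path can consult pte_needs_flush() to skip the TLB flush when no cached translation can be stale:

	/* Illustrative only: flush the TLB only when the old flags require it. */
	static void sketch_change_one_pte(struct vm_area_struct *vma,
					  unsigned long addr, pte_t *ptep,
					  pgprot_t newprot)
	{
		pte_t oldpte = *ptep;
		pte_t newpte = pte_modify(oldpte, newprot);

		set_pte_at(vma->vm_mm, addr, ptep, newpte);

		/* Demoted permissions (e.g. RW or PRESENT cleared) still force a flush. */
		if (pte_needs_flush(oldpte, newpte))
			flush_tlb_page(vma, addr);
	}

The real code batches flushes through the mmu_gather machinery rather than flushing one page at a time; the point here is only when the flush can be elided.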
|
||||
|
||||
#endif /* !MODULE */
|
||||
|
||||
static inline void __native_tlb_flush_global(unsigned long cr4)
|
||||
|
||||
@@ -5,20 +5,6 @@
|
||||
#define MAP_32BIT 0x40 /* only give out 32bit addresses */
|
||||
|
||||
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
|
||||
/*
|
||||
* Take the 4 protection key bits out of the vma->vm_flags
|
||||
* value and turn them in to the bits that we can put in
|
||||
* to a pte.
|
||||
*
|
||||
* Only override these if Protection Keys are available
|
||||
* (which is only on 64-bit).
|
||||
*/
|
||||
#define arch_vm_get_page_prot(vm_flags) __pgprot( \
|
||||
((vm_flags) & VM_PKEY_BIT0 ? _PAGE_PKEY_BIT0 : 0) | \
|
||||
((vm_flags) & VM_PKEY_BIT1 ? _PAGE_PKEY_BIT1 : 0) | \
|
||||
((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \
|
||||
((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0))
|
||||
|
||||
#define arch_calc_vm_prot_bits(prot, key) ( \
|
||||
((key) & 0x1 ? VM_PKEY_BIT0 : 0) | \
|
||||
((key) & 0x2 ? VM_PKEY_BIT1 : 0) | \
|
||||
|
||||
@@ -20,7 +20,7 @@ CFLAGS_REMOVE_mem_encrypt_identity.o = -pg
|
||||
endif
|
||||
|
||||
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \
|
||||
pgtable.o physaddr.o tlb.o cpu_entry_area.o maccess.o
|
||||
pgtable.o physaddr.o tlb.o cpu_entry_area.o maccess.o pgprot.o
|
||||
|
||||
obj-y += pat/
|
||||
|
||||
|
||||
@@ -1269,7 +1269,7 @@ static struct kcore_list kcore_vsyscall;
|
||||
|
||||
static void __init register_page_bootmem_info(void)
|
||||
{
|
||||
#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_FREE_VMEMMAP)
|
||||
#if defined(CONFIG_NUMA) || defined(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP)
|
||||
int i;
|
||||
|
||||
for_each_online_node(i)
|
||||
|
||||
arch/x86/mm/pgprot.c (new file, 35 lines)
@@ -0,0 +1,35 @@
// SPDX-License-Identifier: GPL-2.0

#include <linux/export.h>
#include <linux/mm.h>
#include <asm/pgtable.h>

pgprot_t vm_get_page_prot(unsigned long vm_flags)
{
	unsigned long val = pgprot_val(protection_map[vm_flags &
				(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)]);

#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
	/*
	 * Take the 4 protection key bits out of the vma->vm_flags value and
	 * turn them in to the bits that we can put in to a pte.
	 *
	 * Only override these if Protection Keys are available (which is only
	 * on 64-bit).
	 */
	if (vm_flags & VM_PKEY_BIT0)
		val |= _PAGE_PKEY_BIT0;
	if (vm_flags & VM_PKEY_BIT1)
		val |= _PAGE_PKEY_BIT1;
	if (vm_flags & VM_PKEY_BIT2)
		val |= _PAGE_PKEY_BIT2;
	if (vm_flags & VM_PKEY_BIT3)
		val |= _PAGE_PKEY_BIT3;
#endif

	val = __sme_set(val);
	if (val & _PAGE_PRESENT)
		val &= __supported_pte_mask;
	return __pgprot(val);
}
EXPORT_SYMBOL(vm_get_page_prot);
|
||||
@@ -608,6 +608,16 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
|
||||
|
||||
return young;
|
||||
}
|
||||
|
||||
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
|
||||
pmd_t *pmdp)
|
||||
{
|
||||
/*
|
||||
* No flush is necessary. Once an invalid PTE is established, the PTE's
|
||||
* access and dirty bits cannot be updated.
|
||||
*/
|
||||
return pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
|
||||
@@ -676,9 +686,8 @@ int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
|
||||
*
|
||||
* No 512GB pages yet -- always return 0
|
||||
*/
|
||||
int p4d_clear_huge(p4d_t *p4d)
|
||||
void p4d_clear_huge(p4d_t *p4d)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@@ -636,10 +636,9 @@ static int __add_memory_block(struct memory_block *memory)
|
||||
}
|
||||
ret = xa_err(xa_store(&memory_blocks, memory->dev.id, memory,
|
||||
GFP_KERNEL));
|
||||
if (ret) {
|
||||
put_device(&memory->dev);
|
||||
if (ret)
|
||||
device_unregister(&memory->dev);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
@@ -682,6 +682,7 @@ static int register_node(struct node *node, int num)
|
||||
*/
|
||||
void unregister_node(struct node *node)
|
||||
{
|
||||
compaction_unregister_node(node);
|
||||
hugetlb_unregister_node(node); /* no-op, if memoryless node */
|
||||
node_remove_accesses(node);
|
||||
node_remove_caches(node);
|
||||
|
||||
@@ -190,8 +190,8 @@ static void __loop_update_dio(struct loop_device *lo, bool dio)
|
||||
*/
|
||||
if (dio) {
|
||||
if (queue_logical_block_size(lo->lo_queue) >= sb_bsize &&
|
||||
!(lo->lo_offset & dio_align) &&
|
||||
mapping->a_ops->direct_IO)
|
||||
!(lo->lo_offset & dio_align) &&
|
||||
(file->f_mode & FMODE_CAN_ODIRECT))
|
||||
use_dio = true;
|
||||
else
|
||||
use_dio = false;
|
||||
|
||||
@@ -1,8 +1,9 @@
|
||||
# SPDX-License-Identifier: GPL-2.0
|
||||
config ZRAM
|
||||
tristate "Compressed RAM block device support"
|
||||
depends on BLOCK && SYSFS && ZSMALLOC && CRYPTO
|
||||
depends on BLOCK && SYSFS && MMU
|
||||
depends on CRYPTO_LZO || CRYPTO_ZSTD || CRYPTO_LZ4 || CRYPTO_LZ4HC || CRYPTO_842
|
||||
select ZSMALLOC
|
||||
help
|
||||
Creates virtual block devices called /dev/zramX (X = 0, 1, ...).
|
||||
Pages written to these disks are compressed and stored in memory
|
||||
|
||||
@@ -639,8 +639,8 @@ static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
|
||||
#define PAGE_WB_SIG "page_index="
|
||||
|
||||
#define PAGE_WRITEBACK 0
|
||||
#define HUGE_WRITEBACK 1
|
||||
#define IDLE_WRITEBACK 2
|
||||
#define HUGE_WRITEBACK (1<<0)
|
||||
#define IDLE_WRITEBACK (1<<1)
|
||||
|
||||
|
||||
static ssize_t writeback_store(struct device *dev,
|
||||
@@ -660,6 +660,8 @@ static ssize_t writeback_store(struct device *dev,
|
||||
mode = IDLE_WRITEBACK;
|
||||
else if (sysfs_streq(buf, "huge"))
|
||||
mode = HUGE_WRITEBACK;
|
||||
else if (sysfs_streq(buf, "huge_idle"))
|
||||
mode = IDLE_WRITEBACK | HUGE_WRITEBACK;
|
||||
else {
|
||||
if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1))
|
||||
return -EINVAL;
|
||||
@@ -721,10 +723,10 @@ static ssize_t writeback_store(struct device *dev,
|
||||
zram_test_flag(zram, index, ZRAM_UNDER_WB))
|
||||
goto next;
|
||||
|
||||
if (mode == IDLE_WRITEBACK &&
|
||||
if (mode & IDLE_WRITEBACK &&
|
||||
!zram_test_flag(zram, index, ZRAM_IDLE))
|
||||
goto next;
|
||||
if (mode == HUGE_WRITEBACK &&
|
||||
if (mode & HUGE_WRITEBACK &&
|
||||
!zram_test_flag(zram, index, ZRAM_HUGE))
|
||||
goto next;
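The switch from '==' to '&' above matters because the new "huge_idle" keyword sets both bits at once; a page is written back only if it satisfies every requested filter. A small sketch of the resulting decision logic (helper name invented, flag names as in the hunk):

	/* Illustrative only: mirror of the writeback_store() filtering above. */
	static bool sketch_should_writeback(int mode, bool is_idle, bool is_huge)
	{
		if ((mode & IDLE_WRITEBACK) && !is_idle)
			return false;
		if ((mode & HUGE_WRITEBACK) && !is_huge)
			return false;
		return true;
	}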
|
||||
/*
|
||||
@@ -1142,15 +1144,14 @@ static ssize_t bd_stat_show(struct device *dev,
|
||||
static ssize_t debug_stat_show(struct device *dev,
|
||||
struct device_attribute *attr, char *buf)
|
||||
{
|
||||
int version = 1;
|
||||
int version = 2;
|
||||
struct zram *zram = dev_to_zram(dev);
|
||||
ssize_t ret;
|
||||
|
||||
down_read(&zram->init_lock);
|
||||
ret = scnprintf(buf, PAGE_SIZE,
|
||||
"version: %d\n%8llu %8llu\n",
|
||||
"version: %d\n%8llu\n",
|
||||
version,
|
||||
(u64)atomic64_read(&zram->stats.writestall),
|
||||
(u64)atomic64_read(&zram->stats.miss_free));
|
||||
up_read(&zram->init_lock);
|
||||
|
||||
@@ -1366,7 +1367,6 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
|
||||
}
|
||||
kunmap_atomic(mem);
|
||||
|
||||
compress_again:
|
||||
zstrm = zcomp_stream_get(zram->comp);
|
||||
src = kmap_atomic(page);
|
||||
ret = zcomp_compress(zstrm, src, &comp_len);
|
||||
@@ -1375,39 +1375,20 @@ compress_again:
|
||||
if (unlikely(ret)) {
|
||||
zcomp_stream_put(zram->comp);
|
||||
pr_err("Compression failed! err=%d\n", ret);
|
||||
zs_free(zram->mem_pool, handle);
|
||||
return ret;
|
||||
}
|
||||
|
||||
if (comp_len >= huge_class_size)
|
||||
comp_len = PAGE_SIZE;
|
||||
/*
|
||||
* handle allocation has 2 paths:
|
||||
* a) fast path is executed with preemption disabled (for
|
||||
* per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear,
|
||||
* since we can't sleep;
|
||||
* b) slow path enables preemption and attempts to allocate
|
||||
* the page with __GFP_DIRECT_RECLAIM bit set. we have to
|
||||
* put per-cpu compression stream and, thus, to re-do
|
||||
* the compression once handle is allocated.
|
||||
*
|
||||
* if we have a 'non-null' handle here then we are coming
|
||||
* from the slow path and handle has already been allocated.
|
||||
*/
|
||||
if (!handle)
|
||||
handle = zs_malloc(zram->mem_pool, comp_len,
|
||||
__GFP_KSWAPD_RECLAIM |
|
||||
__GFP_NOWARN |
|
||||
__GFP_HIGHMEM |
|
||||
__GFP_MOVABLE);
|
||||
if (!handle) {
|
||||
|
||||
handle = zs_malloc(zram->mem_pool, comp_len,
|
||||
__GFP_KSWAPD_RECLAIM |
|
||||
__GFP_NOWARN |
|
||||
__GFP_HIGHMEM |
|
||||
__GFP_MOVABLE);
|
||||
|
||||
if (unlikely(!handle)) {
|
||||
zcomp_stream_put(zram->comp);
|
||||
atomic64_inc(&zram->stats.writestall);
|
||||
handle = zs_malloc(zram->mem_pool, comp_len,
|
||||
GFP_NOIO | __GFP_HIGHMEM |
|
||||
__GFP_MOVABLE);
|
||||
if (handle)
|
||||
goto compress_again;
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
@@ -1965,7 +1946,6 @@ static int zram_add(void)
|
||||
if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
|
||||
blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
|
||||
|
||||
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue);
|
||||
ret = device_add_disk(NULL, zram->disk, zram_disk_groups);
|
||||
if (ret)
|
||||
goto out_cleanup_disk;
|
||||
|
||||
@@ -81,7 +81,6 @@ struct zram_stats {
|
||||
atomic64_t huge_pages_since; /* no. of huge pages since zram set up */
|
||||
atomic64_t pages_stored; /* no. of pages currently stored */
|
||||
atomic_long_t max_used_pages; /* no. of maximum pages stored */
|
||||
atomic64_t writestall; /* no. of write slow paths */
|
||||
atomic64_t miss_free; /* no. of missed free */
|
||||
#ifdef CONFIG_ZRAM_WRITEBACK
|
||||
atomic64_t bd_count; /* no. of pages in backing device */
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
|
||||
#include <linux/arm-smccc.h>
|
||||
#include <linux/bitmap.h>
|
||||
#include <linux/cache.h>
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/string.h>
|
||||
|
||||
|
||||
@@ -184,7 +184,7 @@ static int kgd_hqd_load(struct amdgpu_device *adev, void *mqd,
|
||||
|
||||
/* read_user_ptr may take the mm->mmap_lock.
|
||||
* release srbm_mutex to avoid circular dependency between
|
||||
* srbm_mutex->mm_sem->reservation_ww_class_mutex->srbm_mutex.
|
||||
* srbm_mutex->mmap_lock->reservation_ww_class_mutex->srbm_mutex.
|
||||
*/
|
||||
release_queue(adev);
|
||||
valid_wptr = read_user_wptr(mm, wptr, wptr_val);
|
||||
|
||||
@@ -208,7 +208,7 @@ static int kgd_hqd_load(struct amdgpu_device *adev, void *mqd,
|
||||
|
||||
/* read_user_ptr may take the mm->mmap_lock.
|
||||
* release srbm_mutex to avoid circular dependency between
|
||||
* srbm_mutex->mm_sem->reservation_ww_class_mutex->srbm_mutex.
|
||||
* srbm_mutex->mmap_lock->reservation_ww_class_mutex->srbm_mutex.
|
||||
*/
|
||||
release_queue(adev);
|
||||
valid_wptr = read_user_wptr(mm, wptr, wptr_val);
|
||||
|
||||
@@ -102,7 +102,7 @@ static unsigned long ttm_bo_io_mem_pfn(struct ttm_buffer_object *bo,
|
||||
* @bo: The buffer object
|
||||
* @vmf: The fault structure handed to the callback
|
||||
*
|
||||
* vm callbacks like fault() and *_mkwrite() allow for the mm_sem to be dropped
|
||||
* vm callbacks like fault() and *_mkwrite() allow for the mmap_lock to be dropped
|
||||
* during long waits, and after the wait the callback will be restarted. This
|
||||
* is to allow other threads using the same virtual memory space concurrent
|
||||
* access to map(), unmap() completely unrelated buffer objects. TTM buffer
|
||||
|
||||
@@ -2476,10 +2476,10 @@ static int virtio_mem_init_hotplug(struct virtio_mem *vm)
|
||||
VIRTIO_MEM_DEFAULT_OFFLINE_THRESHOLD);
|
||||
|
||||
/*
|
||||
* TODO: once alloc_contig_range() works reliably with pageblock
|
||||
* granularity on ZONE_NORMAL, use pageblock_nr_pages instead.
|
||||
* alloc_contig_range() works reliably with pageblock
|
||||
* granularity on ZONE_NORMAL, use pageblock_nr_pages.
|
||||
*/
|
||||
sb_size = PAGE_SIZE * MAX_ORDER_NR_PAGES;
|
||||
sb_size = PAGE_SIZE * pageblock_nr_pages;
|
||||
sb_size = max_t(uint64_t, vm->device_block_size, sb_size);
|
||||
|
||||
if (sb_size < memory_block_size_bytes() && !force_bbm) {
|
||||
|
||||
fs/Kconfig (22 changed lines)
@@ -245,19 +245,27 @@ config HUGETLBFS
|
||||
config HUGETLB_PAGE
|
||||
def_bool HUGETLBFS
|
||||
|
||||
config HUGETLB_PAGE_FREE_VMEMMAP
|
||||
#
|
||||
# Select this config option from the architecture Kconfig, if it is preferred
|
||||
# to enable the feature of minimizing overhead of struct page associated with
|
||||
# each HugeTLB page.
|
||||
#
|
||||
config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
|
||||
bool
|
||||
|
||||
config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
|
||||
def_bool HUGETLB_PAGE
|
||||
depends on X86_64
|
||||
depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
|
||||
depends on SPARSEMEM_VMEMMAP
|
||||
|
||||
config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON
|
||||
bool "Default freeing vmemmap pages of HugeTLB to on"
|
||||
config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON
|
||||
bool "Default optimizing vmemmap pages of HugeTLB to on"
|
||||
default n
|
||||
depends on HUGETLB_PAGE_FREE_VMEMMAP
|
||||
depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP
|
||||
help
|
||||
When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing unused vmemmap
|
||||
When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap
|
||||
pages associated with each HugeTLB page is default off. Say Y here
|
||||
to enable freeing vmemmap pages of HugeTLB by default. It can then
|
||||
to enable optimizing vmemmap pages of HugeTLB by default. It can then
|
||||
be disabled on the command line via hugetlb_free_vmemmap=off.
|
||||
|
||||
config MEMFD_CREATE
|
||||
|
||||
@@ -4906,6 +4906,10 @@ static int cifs_swap_activate(struct swap_info_struct *sis,
|
||||
|
||||
cifs_dbg(FYI, "swap activate\n");
|
||||
|
||||
if (!swap_file->f_mapping->a_ops->swap_rw)
|
||||
/* Cannot support swap */
|
||||
return -EINVAL;
|
||||
|
||||
spin_lock(&inode->i_lock);
|
||||
blocks = inode->i_blocks;
|
||||
isize = inode->i_size;
|
||||
@@ -4934,7 +4938,8 @@ static int cifs_swap_activate(struct swap_info_struct *sis,
|
||||
* from reading or writing the file
|
||||
*/
|
||||
|
||||
return 0;
|
||||
sis->flags |= SWP_FS_OPS;
|
||||
return add_swap_extent(sis, 0, sis->max, 0);
|
||||
}
|
||||
|
||||
static void cifs_swap_deactivate(struct file *file)
|
||||
|
||||
fs/dax.c (98 changed lines)
@@ -24,6 +24,7 @@
|
||||
#include <linux/sizes.h>
|
||||
#include <linux/mmu_notifier.h>
|
||||
#include <linux/iomap.h>
|
||||
#include <linux/rmap.h>
|
||||
#include <asm/pgalloc.h>
|
||||
|
||||
#define CREATE_TRACE_POINTS
|
||||
@@ -789,95 +790,12 @@ static void *dax_insert_entry(struct xa_state *xas,
|
||||
return entry;
|
||||
}
|
||||
|
||||
static inline
|
||||
unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma)
|
||||
{
|
||||
unsigned long address;
|
||||
|
||||
address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
|
||||
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
|
||||
return address;
|
||||
}
|
||||
|
||||
/* Walk all mappings of a given index of a file and writeprotect them */
|
||||
static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index,
|
||||
unsigned long pfn)
|
||||
{
|
||||
struct vm_area_struct *vma;
|
||||
pte_t pte, *ptep = NULL;
|
||||
pmd_t *pmdp = NULL;
|
||||
spinlock_t *ptl;
|
||||
|
||||
i_mmap_lock_read(mapping);
|
||||
vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) {
|
||||
struct mmu_notifier_range range;
|
||||
unsigned long address;
|
||||
|
||||
cond_resched();
|
||||
|
||||
if (!(vma->vm_flags & VM_SHARED))
|
||||
continue;
|
||||
|
||||
address = pgoff_address(index, vma);
|
||||
|
||||
/*
|
||||
* follow_invalidate_pte() will use the range to call
|
||||
* mmu_notifier_invalidate_range_start() on our behalf before
|
||||
* taking any lock.
|
||||
*/
|
||||
if (follow_invalidate_pte(vma->vm_mm, address, &range, &ptep,
|
||||
&pmdp, &ptl))
|
||||
continue;
|
||||
|
||||
/*
|
||||
* No need to call mmu_notifier_invalidate_range() as we are
|
||||
* downgrading page table protection not changing it to point
|
||||
* to a new page.
|
||||
*
|
||||
* See Documentation/vm/mmu_notifier.rst
|
||||
*/
|
||||
if (pmdp) {
|
||||
#ifdef CONFIG_FS_DAX_PMD
|
||||
pmd_t pmd;
|
||||
|
||||
if (pfn != pmd_pfn(*pmdp))
|
||||
goto unlock_pmd;
|
||||
if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
|
||||
goto unlock_pmd;
|
||||
|
||||
flush_cache_page(vma, address, pfn);
|
||||
pmd = pmdp_invalidate(vma, address, pmdp);
|
||||
pmd = pmd_wrprotect(pmd);
|
||||
pmd = pmd_mkclean(pmd);
|
||||
set_pmd_at(vma->vm_mm, address, pmdp, pmd);
|
||||
unlock_pmd:
|
||||
#endif
|
||||
spin_unlock(ptl);
|
||||
} else {
|
||||
if (pfn != pte_pfn(*ptep))
|
||||
goto unlock_pte;
|
||||
if (!pte_dirty(*ptep) && !pte_write(*ptep))
|
||||
goto unlock_pte;
|
||||
|
||||
flush_cache_page(vma, address, pfn);
|
||||
pte = ptep_clear_flush(vma, address, ptep);
|
||||
pte = pte_wrprotect(pte);
|
||||
pte = pte_mkclean(pte);
|
||||
set_pte_at(vma->vm_mm, address, ptep, pte);
|
||||
unlock_pte:
|
||||
pte_unmap_unlock(ptep, ptl);
|
||||
}
|
||||
|
||||
mmu_notifier_invalidate_range_end(&range);
|
||||
}
|
||||
i_mmap_unlock_read(mapping);
|
||||
}
|
||||
|
||||
static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
|
||||
struct address_space *mapping, void *entry)
|
||||
{
|
||||
unsigned long pfn, index, count;
|
||||
unsigned long pfn, index, count, end;
|
||||
long ret = 0;
|
||||
struct vm_area_struct *vma;
|
||||
|
||||
/*
|
||||
* A page got tagged dirty in DAX mapping? Something is seriously
|
||||
@@ -935,8 +853,16 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
|
||||
pfn = dax_to_pfn(entry);
|
||||
count = 1UL << dax_entry_order(entry);
|
||||
index = xas->xa_index & ~(count - 1);
|
||||
end = index + count - 1;
|
||||
|
||||
/* Walk all mappings of a given index of a file and writeprotect them */
|
||||
i_mmap_lock_read(mapping);
|
||||
vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
|
||||
pfn_mkclean_range(pfn, count, index, vma);
|
||||
cond_resched();
|
||||
}
|
||||
i_mmap_unlock_read(mapping);
|
||||
|
||||
dax_entry_mkclean(mapping, index, pfn);
|
||||
dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
|
||||
/*
|
||||
* After we have flushed the cache, we can clear the dirty tag. There
|
||||
|
||||
@@ -758,6 +758,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
|
||||
unsigned long stack_size;
|
||||
unsigned long stack_expand;
|
||||
unsigned long rlim_stack;
|
||||
struct mmu_gather tlb;
|
||||
|
||||
#ifdef CONFIG_STACK_GROWSUP
|
||||
/* Limit stack size */
|
||||
@@ -812,8 +813,11 @@ int setup_arg_pages(struct linux_binprm *bprm,
|
||||
vm_flags |= mm->def_flags;
|
||||
vm_flags |= VM_STACK_INCOMPLETE_SETUP;
|
||||
|
||||
ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
|
||||
tlb_gather_mmu(&tlb, mm);
|
||||
ret = mprotect_fixup(&tlb, vma, &prev, vma->vm_start, vma->vm_end,
|
||||
vm_flags);
|
||||
tlb_finish_mmu(&tlb);
|
||||
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
BUG_ON(prev != vma);
|
||||
|
||||
@@ -56,11 +56,10 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
|
||||
arg |= O_NONBLOCK;
|
||||
|
||||
/* Pipe packetized mode is controlled by O_DIRECT flag */
|
||||
if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT)) {
|
||||
if (!filp->f_mapping || !filp->f_mapping->a_ops ||
|
||||
!filp->f_mapping->a_ops->direct_IO)
|
||||
return -EINVAL;
|
||||
}
|
||||
if (!S_ISFIFO(inode->i_mode) &&
|
||||
(arg & O_DIRECT) &&
|
||||
!(filp->f_mode & FMODE_CAN_ODIRECT))
|
||||
return -EINVAL;
|
||||
|
||||
if (filp->f_op->check_flags)
|
||||
error = filp->f_op->check_flags(arg);
|
||||
|
||||
@@ -405,7 +405,8 @@ static void remove_huge_page(struct page *page)
|
||||
}
|
||||
|
||||
static void
|
||||
hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
|
||||
hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
|
||||
zap_flags_t zap_flags)
|
||||
{
|
||||
struct vm_area_struct *vma;
|
||||
|
||||
@@ -439,7 +440,7 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
|
||||
}
|
||||
|
||||
unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
|
||||
NULL);
|
||||
NULL, zap_flags);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -517,7 +518,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
|
||||
mutex_lock(&hugetlb_fault_mutex_table[hash]);
|
||||
hugetlb_vmdelete_list(&mapping->i_mmap,
|
||||
index * pages_per_huge_page(h),
|
||||
(index + 1) * pages_per_huge_page(h));
|
||||
(index + 1) * pages_per_huge_page(h),
|
||||
ZAP_FLAG_DROP_MARKER);
|
||||
i_mmap_unlock_write(mapping);
|
||||
}
|
||||
|
||||
@@ -583,7 +585,8 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
|
||||
i_mmap_lock_write(mapping);
|
||||
i_size_write(inode, offset);
|
||||
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
|
||||
hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
|
||||
hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
|
||||
ZAP_FLAG_DROP_MARKER);
|
||||
i_mmap_unlock_write(mapping);
|
||||
remove_inode_hugepages(inode, offset, LLONG_MAX);
|
||||
}
|
||||
@@ -616,8 +619,8 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
|
||||
i_mmap_lock_write(mapping);
|
||||
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
|
||||
hugetlb_vmdelete_list(&mapping->i_mmap,
|
||||
hole_start >> PAGE_SHIFT,
|
||||
hole_end >> PAGE_SHIFT);
|
||||
hole_start >> PAGE_SHIFT,
|
||||
hole_end >> PAGE_SHIFT, 0);
|
||||
i_mmap_unlock_write(mapping);
|
||||
remove_inode_hugepages(inode, hole_start, hole_end);
|
||||
inode_unlock(inode);
|
||||
@@ -1048,12 +1051,12 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
|
||||
if (sbinfo->spool) {
|
||||
long free_pages;
|
||||
|
||||
spin_lock(&sbinfo->spool->lock);
|
||||
spin_lock_irq(&sbinfo->spool->lock);
|
||||
buf->f_blocks = sbinfo->spool->max_hpages;
|
||||
free_pages = sbinfo->spool->max_hpages
|
||||
- sbinfo->spool->used_hpages;
|
||||
buf->f_bavail = buf->f_bfree = free_pages;
|
||||
spin_unlock(&sbinfo->spool->lock);
|
||||
spin_unlock_irq(&sbinfo->spool->lock);
|
||||
buf->f_files = sbinfo->max_inodes;
|
||||
buf->f_ffree = sbinfo->free_inodes;
|
||||
}
|
||||
|
||||
@@ -153,28 +153,25 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq,
|
||||
}
|
||||
|
||||
/**
|
||||
* nfs_direct_IO - NFS address space operation for direct I/O
|
||||
* nfs_swap_rw - NFS address space operation for swap I/O
|
||||
* @iocb: target I/O control block
|
||||
* @iter: I/O buffer
|
||||
*
|
||||
* The presence of this routine in the address space ops vector means
|
||||
* the NFS client supports direct I/O. However, for most direct IO, we
|
||||
* shunt off direct read and write requests before the VFS gets them,
|
||||
* so this method is only ever called for swap.
|
||||
* Perform IO to the swap-file. This is much like direct IO.
|
||||
*/
|
||||
ssize_t nfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
|
||||
int nfs_swap_rw(struct kiocb *iocb, struct iov_iter *iter)
|
||||
{
|
||||
struct inode *inode = iocb->ki_filp->f_mapping->host;
|
||||
|
||||
/* we only support swap file calling nfs_direct_IO */
|
||||
if (!IS_SWAPFILE(inode))
|
||||
return 0;
|
||||
ssize_t ret;
|
||||
|
||||
VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
|
||||
|
||||
if (iov_iter_rw(iter) == READ)
|
||||
return nfs_file_direct_read(iocb, iter, true);
|
||||
return nfs_file_direct_write(iocb, iter, true);
|
||||
ret = nfs_file_direct_read(iocb, iter, true);
|
||||
else
|
||||
ret = nfs_file_direct_write(iocb, iter, true);
|
||||
if (ret < 0)
|
||||
return ret;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
|
||||
|
||||
@@ -69,6 +69,8 @@ nfs_file_open(struct inode *inode, struct file *filp)
|
||||
return res;
|
||||
|
||||
res = nfs_open(inode, filp);
|
||||
if (res == 0)
|
||||
filp->f_mode |= FMODE_CAN_ODIRECT;
|
||||
return res;
|
||||
}
|
||||
|
||||
@@ -480,6 +482,7 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
|
||||
{
|
||||
unsigned long blocks;
|
||||
long long isize;
|
||||
int ret;
|
||||
struct inode *inode = file_inode(file);
|
||||
struct rpc_clnt *clnt = NFS_CLIENT(inode);
|
||||
struct nfs_client *cl = NFS_SERVER(inode)->nfs_client;
|
||||
@@ -493,13 +496,22 @@ static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
*span = sis->pages;
|
||||
ret = rpc_clnt_swap_activate(clnt);
|
||||
if (ret)
|
||||
return ret;
|
||||
ret = add_swap_extent(sis, 0, sis->max, 0);
|
||||
if (ret < 0) {
|
||||
rpc_clnt_swap_deactivate(clnt);
|
||||
return ret;
|
||||
}
|
||||
|
||||
*span = sis->pages;
|
||||
|
||||
if (cl->rpc_ops->enable_swap)
|
||||
cl->rpc_ops->enable_swap(inode);
|
||||
|
||||
return rpc_clnt_swap_activate(clnt);
|
||||
sis->flags |= SWP_FS_OPS;
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void nfs_swap_deactivate(struct file *file)
|
||||
@@ -523,7 +535,6 @@ const struct address_space_operations nfs_file_aops = {
|
||||
.write_end = nfs_write_end,
|
||||
.invalidate_folio = nfs_invalidate_folio,
|
||||
.release_folio = nfs_release_folio,
|
||||
.direct_IO = nfs_direct_IO,
|
||||
#ifdef CONFIG_MIGRATION
|
||||
.migratepage = nfs_migrate_page,
|
||||
#endif
|
||||
@@ -532,6 +543,7 @@ const struct address_space_operations nfs_file_aops = {
|
||||
.error_remove_page = generic_error_remove_page,
|
||||
.swap_activate = nfs_swap_activate,
|
||||
.swap_deactivate = nfs_swap_deactivate,
|
||||
.swap_rw = nfs_swap_rw,
|
||||
};
|
||||
|
||||
/*
|
||||
|
||||
@@ -834,16 +834,15 @@ static int do_dentry_open(struct file *f,
|
||||
if ((f->f_mode & FMODE_WRITE) &&
|
||||
likely(f->f_op->write || f->f_op->write_iter))
|
||||
f->f_mode |= FMODE_CAN_WRITE;
|
||||
if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
|
||||
f->f_mode |= FMODE_CAN_ODIRECT;
|
||||
|
||||
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
|
||||
|
||||
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
|
||||
|
||||
/* NB: we're sure to have correct a_ops only after f_op->open */
|
||||
if (f->f_flags & O_DIRECT) {
|
||||
if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
|
||||
return -EINVAL;
|
||||
}
|
||||
if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT))
|
||||
return -EINVAL;
|
||||
|
||||
/*
|
||||
* XXX: Huge page cache doesn't support writing yet. Drop all page
|
||||
|
||||
@@ -82,11 +82,8 @@ static int ovl_change_flags(struct file *file, unsigned int flags)
|
||||
if (((flags ^ file->f_flags) & O_APPEND) && IS_APPEND(inode))
|
||||
return -EPERM;
|
||||
|
||||
if (flags & O_DIRECT) {
|
||||
if (!file->f_mapping->a_ops ||
|
||||
!file->f_mapping->a_ops->direct_IO)
|
||||
return -EINVAL;
|
||||
}
|
||||
if ((flags & O_DIRECT) && !(file->f_mode & FMODE_CAN_ODIRECT))
|
||||
return -EINVAL;
|
||||
|
||||
if (file->f_op->check_flags) {
|
||||
err = file->f_op->check_flags(flags);
|
||||
@@ -306,8 +303,7 @@ static ssize_t ovl_read_iter(struct kiocb *iocb, struct iov_iter *iter)
|
||||
|
||||
ret = -EINVAL;
|
||||
if (iocb->ki_flags & IOCB_DIRECT &&
|
||||
(!real.file->f_mapping->a_ops ||
|
||||
!real.file->f_mapping->a_ops->direct_IO))
|
||||
!(real.file->f_mode & FMODE_CAN_ODIRECT))
|
||||
goto out_fdput;
|
||||
|
||||
old_cred = ovl_override_creds(file_inode(file)->i_sb);
|
||||
@@ -367,8 +363,7 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
|
||||
|
||||
ret = -EINVAL;
|
||||
if (iocb->ki_flags & IOCB_DIRECT &&
|
||||
(!real.file->f_mapping->a_ops ||
|
||||
!real.file->f_mapping->a_ops->direct_IO))
|
||||
!(real.file->f_mode & FMODE_CAN_ODIRECT))
|
||||
goto out_fdput;
|
||||
|
||||
if (!ovl_should_sync(OVL_FS(inode->i_sb)))
|
||||
|
||||
@@ -3154,6 +3154,22 @@ static int proc_pid_patch_state(struct seq_file *m, struct pid_namespace *ns,
|
||||
}
|
||||
#endif /* CONFIG_LIVEPATCH */
|
||||
|
||||
#ifdef CONFIG_KSM
|
||||
static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace *ns,
|
||||
struct pid *pid, struct task_struct *task)
|
||||
{
|
||||
struct mm_struct *mm;
|
||||
|
||||
mm = get_task_mm(task);
|
||||
if (mm) {
|
||||
seq_printf(m, "%lu\n", mm->ksm_merging_pages);
|
||||
mmput(mm);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
#endif /* CONFIG_KSM */
|
||||
|
||||
#ifdef CONFIG_STACKLEAK_METRICS
|
||||
static int proc_stack_depth(struct seq_file *m, struct pid_namespace *ns,
|
||||
struct pid *pid, struct task_struct *task)
|
||||
@@ -3285,6 +3301,9 @@ static const struct pid_entry tgid_base_stuff[] = {
|
||||
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
|
||||
ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
|
||||
#endif
|
||||
#ifdef CONFIG_KSM
|
||||
ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
|
||||
#endif
|
||||
};
|
||||
|
||||
static int proc_tgid_base_readdir(struct file *file, struct dir_context *ctx)
|
||||
@@ -3618,6 +3637,9 @@ static const struct pid_entry tid_base_stuff[] = {
|
||||
#ifdef CONFIG_SECCOMP_CACHE_DEBUG
|
||||
ONE("seccomp_cache", S_IRUSR, proc_pid_seccomp_cache),
|
||||
#endif
|
||||
#ifdef CONFIG_KSM
|
||||
ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages),
|
||||
#endif
|
||||
};
|
||||
|
||||
static int proc_tid_base_readdir(struct file *file, struct dir_context *ctx)
|
||||
|
||||
@@ -86,6 +86,13 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
|
||||
|
||||
show_val_kb(m, "SwapTotal: ", i.totalswap);
|
||||
show_val_kb(m, "SwapFree: ", i.freeswap);
|
||||
#ifdef CONFIG_ZSWAP
|
||||
seq_printf(m, "Zswap: %8lu kB\n",
|
||||
(unsigned long)(zswap_pool_total_size >> 10));
|
||||
seq_printf(m, "Zswapped: %8lu kB\n",
|
||||
(unsigned long)atomic_read(&zswap_stored_pages) <<
|
||||
(PAGE_SHIFT - 10));
|
||||
#endif
|
||||
show_val_kb(m, "Dirty: ",
|
||||
global_node_page_state(NR_FILE_DIRTY));
|
||||
show_val_kb(m, "Writeback: ",
|
||||
|
||||
@@ -1421,6 +1421,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
|
||||
migration = is_migration_entry(entry);
|
||||
if (is_pfn_swap_entry(entry))
|
||||
page = pfn_swap_entry_to_page(entry);
|
||||
if (pte_marker_entry_uffd_wp(entry))
|
||||
flags |= PM_UFFD_WP;
|
||||
}
|
||||
|
||||
if (page && !PageAnon(page))
|
||||
@@ -1556,10 +1558,15 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
|
||||
if (page_mapcount(page) == 1)
|
||||
flags |= PM_MMAP_EXCLUSIVE;
|
||||
|
||||
if (huge_pte_uffd_wp(pte))
|
||||
flags |= PM_UFFD_WP;
|
||||
|
||||
flags |= PM_PRESENT;
|
||||
if (pm->show_pfn)
|
||||
frame = pte_pfn(pte) +
|
||||
((addr & ~hmask) >> PAGE_SHIFT);
|
||||
} else if (pte_swp_uffd_wp_any(pte)) {
|
||||
flags |= PM_UFFD_WP;
|
||||
}
|
||||
|
||||
for (; addr != end; addr += PAGE_SIZE) {
|
||||
@@ -1873,8 +1880,6 @@ static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
|
||||
return 0;
|
||||
|
||||
page = pte_page(huge_pte);
|
||||
if (!page)
|
||||
return 0;
|
||||
|
||||
md = walk->private;
|
||||
gather_stats(page, md, pte_dirty(huge_pte), 1);
|
||||
|
||||
@@ -29,6 +29,7 @@
|
||||
#include <linux/ioctl.h>
|
||||
#include <linux/security.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/swapops.h>
|
||||
|
||||
int sysctl_unprivileged_userfaultfd __read_mostly;
|
||||
|
||||
@@ -249,9 +250,10 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
|
||||
|
||||
/*
|
||||
* Lockless access: we're in a wait_event so it's ok if it
|
||||
* changes under us.
|
||||
* changes under us. PTE markers should be handled the same as none
|
||||
* ptes here.
|
||||
*/
|
||||
if (huge_pte_none(pte))
|
||||
if (huge_pte_none_mostly(pte))
|
||||
ret = true;
|
||||
if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
|
||||
ret = true;
|
||||
@@ -330,9 +332,10 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
|
||||
pte = pte_offset_map(pmd, address);
|
||||
/*
|
||||
* Lockless access: we're in a wait_event so it's ok if it
|
||||
* changes under us.
|
||||
* changes under us. PTE markers should be handled the same as none
|
||||
* ptes here.
|
||||
*/
|
||||
if (pte_none(*pte))
|
||||
if (pte_none_mostly(*pte))
|
||||
ret = true;
|
||||
if (!pte_write(*pte) && (reason & VM_UFFD_WP))
|
||||
ret = true;
|
||||
@@ -1255,24 +1258,6 @@ static __always_inline int validate_range(struct mm_struct *mm,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline bool vma_can_userfault(struct vm_area_struct *vma,
|
||||
unsigned long vm_flags)
|
||||
{
|
||||
/* FIXME: add WP support to hugetlbfs and shmem */
|
||||
if (vm_flags & VM_UFFD_WP) {
|
||||
if (is_vm_hugetlb_page(vma) || vma_is_shmem(vma))
|
||||
return false;
|
||||
}
|
||||
|
||||
if (vm_flags & VM_UFFD_MINOR) {
|
||||
if (!(is_vm_hugetlb_page(vma) || vma_is_shmem(vma)))
|
||||
return false;
|
||||
}
|
||||
|
||||
return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) ||
|
||||
vma_is_shmem(vma);
|
||||
}
|
||||
|
||||
static int userfaultfd_register(struct userfaultfd_ctx *ctx,
|
||||
unsigned long arg)
|
||||
{
|
||||
@@ -1953,6 +1938,9 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
|
||||
#endif
|
||||
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP
|
||||
uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
|
||||
#endif
|
||||
#ifndef CONFIG_PTE_MARKER_UFFD_WP
|
||||
uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
|
||||
#endif
|
||||
uffdio_api.ioctls = UFFD_API_IOCTLS;
|
||||
ret = -EFAULT;
|
||||
|
||||
@@ -2,6 +2,9 @@
|
||||
#ifndef _ASM_GENERIC_HUGETLB_H
|
||||
#define _ASM_GENERIC_HUGETLB_H
|
||||
|
||||
#include <linux/swap.h>
|
||||
#include <linux/swapops.h>
|
||||
|
||||
static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot)
|
||||
{
|
||||
return mk_pte(page, pgprot);
|
||||
@@ -32,6 +35,21 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot)
|
||||
return pte_modify(pte, newprot);
|
||||
}
|
||||
|
||||
static inline pte_t huge_pte_mkuffd_wp(pte_t pte)
|
||||
{
|
||||
return pte_mkuffd_wp(pte);
|
||||
}
|
||||
|
||||
static inline pte_t huge_pte_clear_uffd_wp(pte_t pte)
|
||||
{
|
||||
return pte_clear_uffd_wp(pte);
|
||||
}
|
||||
|
||||
static inline int huge_pte_uffd_wp(pte_t pte)
|
||||
{
|
||||
return pte_uffd_wp(pte);
|
||||
}
|
||||
|
||||
#ifndef __HAVE_ARCH_HUGE_PTE_CLEAR
|
||||
static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr,
|
||||
pte_t *ptep, unsigned long sz)
|
||||
@@ -66,10 +84,10 @@ static inline pte_t huge_ptep_get_and_clear(struct mm_struct *mm,
|
||||
#endif
|
||||
|
||||
#ifndef __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH
|
||||
static inline void huge_ptep_clear_flush(struct vm_area_struct *vma,
|
||||
static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
|
||||
unsigned long addr, pte_t *ptep)
|
||||
{
|
||||
ptep_clear_flush(vma, addr, ptep);
|
||||
return ptep_clear_flush(vma, addr, ptep);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -80,6 +98,12 @@ static inline int huge_pte_none(pte_t pte)
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Please refer to comments above pte_none_mostly() for the usage */
|
||||
static inline int huge_pte_none_mostly(pte_t pte)
|
||||
{
|
||||
return huge_pte_none(pte) || is_pte_marker(pte);
|
||||
}
|
||||
|
||||
#ifndef __HAVE_ARCH_HUGE_PTE_WRPROTECT
|
||||
static inline pte_t huge_pte_wrprotect(pte_t pte)
|
||||
{
|
||||
|
||||
@@ -30,6 +30,8 @@ typedef struct { pud_t pud; } pmd_t;
|
||||
static inline int pud_none(pud_t pud) { return 0; }
|
||||
static inline int pud_bad(pud_t pud) { return 0; }
|
||||
static inline int pud_present(pud_t pud) { return 1; }
|
||||
static inline int pud_user(pud_t pud) { return 0; }
|
||||
static inline int pud_leaf(pud_t pud) { return 0; }
|
||||
static inline void pud_clear(pud_t *pud) { }
|
||||
#define pmd_ERROR(pmd) (pud_ERROR((pmd).pud))
|
||||
|
||||
|
||||
@@ -658,6 +658,20 @@ static inline void tlb_flush_p4d_range(struct mmu_gather *tlb,
|
||||
} while (0)
|
||||
#endif
|
||||
|
||||
#ifndef pte_needs_flush
|
||||
static inline bool pte_needs_flush(pte_t oldpte, pte_t newpte)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef huge_pmd_needs_flush
|
||||
static inline bool huge_pmd_needs_flush(pmd_t oldpmd, pmd_t newpmd)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* CONFIG_MMU */
|
||||
|
||||
#endif /* _ASM_GENERIC__TLB_H */
|
||||
|
||||
@@ -17,11 +17,11 @@
|
||||
#define CMA_MAX_NAME 64
|
||||
|
||||
/*
|
||||
* TODO: once the buddy -- especially pageblock merging and alloc_contig_range()
|
||||
* the buddy -- especially pageblock merging and alloc_contig_range()
|
||||
* -- can deal with only some pageblocks of a higher-order page being
|
||||
* MIGRATE_CMA, we can use pageblock_nr_pages.
|
||||
*/
|
||||
#define CMA_MIN_ALIGNMENT_PAGES MAX_ORDER_NR_PAGES
|
||||
#define CMA_MIN_ALIGNMENT_PAGES pageblock_nr_pages
|
||||
#define CMA_MIN_ALIGNMENT_BYTES (PAGE_SIZE * CMA_MIN_ALIGNMENT_PAGES)
|
||||
|
||||
struct cma;
|
||||
|
||||
@@ -177,7 +177,7 @@ static inline bool compaction_withdrawn(enum compact_result result)
|
||||
bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
|
||||
int alloc_flags);
|
||||
|
||||
extern int kcompactd_run(int nid);
|
||||
extern void kcompactd_run(int nid);
|
||||
extern void kcompactd_stop(int nid);
|
||||
extern void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx);
|
||||
|
||||
@@ -212,9 +212,8 @@ static inline bool compaction_withdrawn(enum compact_result result)
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline int kcompactd_run(int nid)
|
||||
static inline void kcompactd_run(int nid)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
static inline void kcompactd_stop(int nid)
|
||||
{
|
||||
|
||||
@@ -261,10 +261,14 @@ struct damos {
|
||||
* enum damon_ops_id - Identifier for each monitoring operations implementation
|
||||
*
|
||||
* @DAMON_OPS_VADDR: Monitoring operations for virtual address spaces
|
||||
* @DAMON_OPS_FVADDR: Monitoring operations for only fixed ranges of virtual
|
||||
* address spaces
|
||||
* @DAMON_OPS_PADDR: Monitoring operations for the physical address space
|
||||
* @NR_DAMON_OPS: Number of monitoring operations implementations
|
||||
*/
|
||||
enum damon_ops_id {
|
||||
DAMON_OPS_VADDR,
|
||||
DAMON_OPS_FVADDR,
|
||||
DAMON_OPS_PADDR,
|
||||
NR_DAMON_OPS,
|
||||
};
|
||||
@@ -340,6 +344,7 @@ struct damon_operations {
|
||||
* struct damon_callback - Monitoring events notification callbacks.
|
||||
*
|
||||
* @before_start: Called before starting the monitoring.
|
||||
* @after_wmarks_check: Called after each schemes' watermarks check.
|
||||
* @after_sampling: Called after each sampling.
|
||||
* @after_aggregation: Called after each aggregation.
|
||||
* @before_terminate: Called before terminating the monitoring.
|
||||
@@ -350,6 +355,11 @@ struct damon_operations {
|
||||
* respectively. Therefore, those are good places for installing and cleaning
|
||||
* @private.
|
||||
*
|
||||
* The monitoring thread calls @after_wmarks_check after each DAMON-based
|
||||
* operation schemes' watermarks check. If users need to make changes to the
|
||||
* attributes of the monitoring context while it's deactivated due to the
|
||||
* watermarks, this is a good place to do so.
|
||||
*
|
||||
* The monitoring thread calls @after_sampling and @after_aggregation for each
|
||||
* of the sampling intervals and aggregation intervals, respectively.
|
||||
* Therefore, users can safely access the monitoring results without additional
|
||||
@@ -362,6 +372,7 @@ struct damon_callback {
|
||||
void *private;
|
||||
|
||||
int (*before_start)(struct damon_ctx *context);
|
||||
int (*after_wmarks_check)(struct damon_ctx *context);
|
||||
int (*after_sampling)(struct damon_ctx *context);
|
||||
int (*after_aggregation)(struct damon_ctx *context);
|
||||
void (*before_terminate)(struct damon_ctx *context);
|
||||
@@ -484,6 +495,8 @@ static inline void damon_insert_region(struct damon_region *r,
|
||||
|
||||
void damon_add_region(struct damon_region *r, struct damon_target *t);
|
||||
void damon_destroy_region(struct damon_region *r, struct damon_target *t);
|
||||
int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges,
|
||||
unsigned int nr_ranges);
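Tying the new DAMON_OPS_FVADDR id and the damon_set_regions() helper together, a hedged sketch of pinning monitoring to one fixed virtual range (the start/end field names of struct damon_addr_range are assumed here, and error handling is minimal):

	/* Sketch: configure a context to monitor exactly one fixed virtual range. */
	static int sketch_damon_fixed_range(struct damon_ctx *ctx,
					    struct damon_target *t,
					    unsigned long start, unsigned long end)
	{
		struct damon_addr_range range = {
			.start = start,	/* assumed field names */
			.end = end,
		};
		int err;

		err = damon_select_ops(ctx, DAMON_OPS_FVADDR);
		if (err)
			return err;

		/* FVADDR keeps the given regions fixed instead of re-discovering them. */
		return damon_set_regions(t, &range, 1);
	}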
|
||||
|
||||
struct damos *damon_new_scheme(
|
||||
unsigned long min_sz_region, unsigned long max_sz_region,
|
||||
@@ -509,6 +522,7 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int,
|
||||
int damon_set_schemes(struct damon_ctx *ctx,
|
||||
struct damos **schemes, ssize_t nr_schemes);
|
||||
int damon_nr_running_ctxs(void);
|
||||
bool damon_is_registered_ops(enum damon_ops_id id);
|
||||
int damon_register_ops(struct damon_operations *ops);
|
||||
int damon_select_ops(struct damon_ctx *ctx, enum damon_ops_id id);
|
||||
|
||||
|
||||
@@ -20,6 +20,7 @@ struct fault_attr {
|
||||
atomic_t space;
|
||||
unsigned long verbose;
|
||||
bool task_filter;
|
||||
bool no_warn;
|
||||
unsigned long stacktrace_depth;
|
||||
unsigned long require_start;
|
||||
unsigned long require_end;
|
||||
@@ -39,6 +40,7 @@ struct fault_attr {
|
||||
.ratelimit_state = RATELIMIT_STATE_INIT_DISABLED, \
|
||||
.verbose = 2, \
|
||||
.dname = NULL, \
|
||||
.no_warn = false, \
|
||||
}
|
||||
|
||||
#define DECLARE_FAULT_ATTR(name) struct fault_attr name = FAULT_ATTR_INITIALIZER
|
||||
|
||||
@@ -162,6 +162,9 @@ typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
|
||||
/* File is stream-like */
|
||||
#define FMODE_STREAM ((__force fmode_t)0x200000)
|
||||
|
||||
/* File supports DIRECT IO */
|
||||
#define FMODE_CAN_ODIRECT ((__force fmode_t)0x400000)
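The new flag replaces the scattered a_ops->direct_IO checks seen in the earlier do_dentry_open(), loop and overlayfs hunks: a filesystem now opts in explicitly at open time, as nfs_file_open() does above. A hedged sketch of the opt-in (examplefs is hypothetical):

	/* Sketch of a filesystem open method advertising O_DIRECT support. */
	static int examplefs_open(struct inode *inode, struct file *filp)
	{
		/* Without this, an O_DIRECT open now fails with -EINVAL. */
		filp->f_mode |= FMODE_CAN_ODIRECT;
		return generic_file_open(inode, filp);
	}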
|
||||
|
||||
/* File was opened by fanotify and shouldn't generate fanotify events */
|
||||
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
|
||||
|
||||
@@ -376,6 +379,7 @@ struct address_space_operations {
|
||||
int (*swap_activate)(struct swap_info_struct *sis, struct file *file,
|
||||
sector_t *span);
|
||||
void (*swap_deactivate)(struct file *file);
|
||||
int (*swap_rw)(struct kiocb *iocb, struct iov_iter *iter);
|
||||
};
|
||||
|
||||
extern const struct address_space_operations empty_aops;
|
||||
@@ -457,6 +461,11 @@ static inline void i_mmap_unlock_write(struct address_space *mapping)
|
||||
up_write(&mapping->i_mmap_rwsem);
|
||||
}
|
||||
|
||||
static inline int i_mmap_trylock_read(struct address_space *mapping)
|
||||
{
|
||||
return down_read_trylock(&mapping->i_mmap_rwsem);
|
||||
}
|
||||
|
||||
static inline void i_mmap_lock_read(struct address_space *mapping)
|
||||
{
|
||||
down_read(&mapping->i_mmap_rwsem);
|
||||
|
||||
@@ -367,7 +367,7 @@ static inline int gfp_migratetype(const gfp_t gfp_flags)
|
||||
return MIGRATE_UNMOVABLE;
|
||||
|
||||
/* Group based on mobility */
|
||||
return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;
|
||||
return (__force unsigned long)(gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT;
|
||||
}
|
||||
#undef GFP_MOVABLE_MASK
|
||||
#undef GFP_MOVABLE_SHIFT
|
||||
@@ -613,13 +613,8 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask,
|
||||
#ifdef CONFIG_NUMA
|
||||
struct page *alloc_pages(gfp_t gfp, unsigned int order);
|
||||
struct folio *folio_alloc(gfp_t gfp, unsigned order);
|
||||
struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
|
||||
struct vm_area_struct *vma, unsigned long addr,
|
||||
bool hugepage);
|
||||
struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma,
|
||||
unsigned long addr, bool hugepage);
|
||||
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
|
||||
alloc_pages_vma(gfp_mask, order, vma, addr, true)
|
||||
#else
|
||||
static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
|
||||
{
|
||||
@@ -629,16 +624,17 @@ static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
|
||||
{
|
||||
return __folio_alloc_node(gfp, order, numa_node_id());
|
||||
}
|
||||
#define alloc_pages_vma(gfp_mask, order, vma, addr, hugepage) \
|
||||
alloc_pages(gfp_mask, order)
|
||||
#define vma_alloc_folio(gfp, order, vma, addr, hugepage) \
|
||||
folio_alloc(gfp, order)
|
||||
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
|
||||
alloc_pages(gfp_mask, order)
|
||||
#endif
|
||||
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
|
||||
#define alloc_page_vma(gfp_mask, vma, addr) \
|
||||
alloc_pages_vma(gfp_mask, 0, vma, addr, false)
|
||||
static inline struct page *alloc_page_vma(gfp_t gfp,
|
||||
struct vm_area_struct *vma, unsigned long addr)
|
||||
{
|
||||
struct folio *folio = vma_alloc_folio(gfp, 0, vma, addr, false);
|
||||
|
||||
return &folio->page;
|
||||
}
|
||||
|
||||
extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
|
||||
extern unsigned long get_zeroed_page(gfp_t gfp_mask);
|
||||
|
||||
@@ -246,9 +246,21 @@ static inline bool is_kmap_addr(const void *x)
|
||||
|
||||
#endif /* CONFIG_HIGHMEM */
|
||||
|
||||
/*
|
||||
* Prevent people trying to call kunmap_atomic() as if it were kunmap()
|
||||
* kunmap_atomic() should get the return value of kmap_atomic, not the page.
|
||||
/**
|
||||
* kunmap_atomic - Unmap the virtual address mapped by kmap_atomic() - deprecated!
|
||||
* @__addr: Virtual address to be unmapped
|
||||
*
|
||||
* Unmaps an address previously mapped by kmap_atomic() and re-enables
|
||||
* pagefaults. Depending on PREEMPT_RT configuration, it also re-enables
|
||||
* migration and preemption. Users should not count on these side effects.
|
||||
*
|
||||
* Mappings should be unmapped in the reverse order that they were mapped.
|
||||
* See kmap_local_page() for details on nesting.
|
||||
*
|
||||
* @__addr can be any address within the mapped page, so there is no need
|
||||
* to subtract any offset that has been added. In contrast to kunmap(),
|
||||
* this function takes the address returned from kmap_atomic(), not the
|
||||
* page passed to it. The compiler will warn you if you pass the page.
|
||||
*/
|
||||
#define kunmap_atomic(__addr) \
|
||||
do { \
|
||||
|
||||
@@ -37,7 +37,7 @@ static inline void *kmap(struct page *page);
|
||||
|
||||
/**
|
||||
* kunmap - Unmap the virtual address mapped by kmap()
|
||||
* @addr: Virtual address to be unmapped
|
||||
* @page: Pointer to the page which was mapped by kmap()
|
||||
*
|
||||
* Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of
|
||||
* pages in the low memory area.
|
||||
@@ -138,24 +138,47 @@ static inline void *kmap_local_folio(struct folio *folio, size_t offset);
|
||||
*
|
||||
* Returns: The virtual address of the mapping
|
||||
*
|
||||
* Effectively a wrapper around kmap_local_page() which disables pagefaults
|
||||
* and preemption.
|
||||
* In fact a wrapper around kmap_local_page() which also disables pagefaults
|
||||
* and, depending on PREEMPT_RT configuration, also CPU migration and
|
||||
* preemption. Therefore users should not count on the latter two side effects.
|
||||
*
|
||||
* Mappings should always be released by kunmap_atomic().
|
||||
*
|
||||
* Do not use in new code. Use kmap_local_page() instead.
|
||||
*
|
||||
* It is used in atomic context when code wants to access the contents of a
|
||||
* page that might be allocated from high memory (see __GFP_HIGHMEM), for
|
||||
* example a page in the pagecache. The API has two functions, and they
|
||||
* can be used in a manner similar to the following:
|
||||
*
|
||||
* -- Find the page of interest. --
|
||||
* struct page *page = find_get_page(mapping, offset);
|
||||
*
|
||||
* -- Gain access to the contents of that page. --
|
||||
* void *vaddr = kmap_atomic(page);
|
||||
*
|
||||
* -- Do something to the contents of that page. --
|
||||
* memset(vaddr, 0, PAGE_SIZE);
|
||||
*
|
||||
* -- Unmap that page. --
|
||||
* kunmap_atomic(vaddr);
|
||||
*
|
||||
* Note that the kunmap_atomic() call takes the result of the kmap_atomic()
|
||||
* call, not the argument.
|
||||
*
|
||||
* If you need to map two pages because you want to copy from one page to
|
||||
* another you need to keep the kmap_atomic calls strictly nested, like:
|
||||
*
|
||||
* vaddr1 = kmap_atomic(page1);
|
||||
* vaddr2 = kmap_atomic(page2);
|
||||
*
|
||||
* memcpy(vaddr1, vaddr2, PAGE_SIZE);
|
||||
*
|
||||
* kunmap_atomic(vaddr2);
|
||||
* kunmap_atomic(vaddr1);
|
||||
*/
|
||||
static inline void *kmap_atomic(struct page *page);
|
||||
|
||||
/**
|
||||
* kunmap_atomic - Unmap the virtual address mapped by kmap_atomic()
|
||||
* @addr: Virtual address to be unmapped
|
||||
*
|
||||
* Counterpart to kmap_atomic().
|
||||
*
|
||||
* Effectively a wrapper around kunmap_local() which additionally undoes
|
||||
* the side effects of kmap_atomic(), i.e. reenabling pagefaults and
|
||||
* preemption.
|
||||
*/
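Since the updated kernel-doc above steers new code away from kmap_atomic(), a minimal sketch of the recommended kmap_local_page()/kunmap_local() pattern may help (the zero-fill body is only an example; memzero_page() later in this diff does exactly this):

	/* Preferred in new code: a purely thread-local, nestable mapping. */
	static void sketch_zero_highmem_page(struct page *page)
	{
		void *vaddr = kmap_local_page(page);

		memset(vaddr, 0, PAGE_SIZE);
		flush_dcache_page(page);
		kunmap_local(vaddr);	/* takes the address, like kunmap_atomic() */
	}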
|
||||
|
||||
/* Highmem related interfaces for management code */
|
||||
static inline unsigned int nr_free_highpages(void);
|
||||
static inline unsigned long totalhigh_pages(void);
|
||||
@@ -191,6 +214,8 @@ static inline void clear_user_highpage(struct page *page, unsigned long vaddr)
|
||||
* @vma: The VMA the page is to be allocated for
|
||||
* @vaddr: The virtual address the page will be inserted into
|
||||
*
|
||||
* Returns: The allocated and zeroed HIGHMEM page
|
||||
*
|
||||
* This function will allocate a page for a VMA that the caller knows will
|
||||
* be able to migrate in the future using move_pages() or reclaimed
|
||||
*
|
||||
@@ -358,6 +383,8 @@ static inline void memcpy_to_page(struct page *page, size_t offset,
|
||||
static inline void memzero_page(struct page *page, size_t offset, size_t len)
|
||||
{
|
||||
char *addr = kmap_local_page(page);
|
||||
|
||||
VM_BUG_ON(offset + len > PAGE_SIZE);
|
||||
memset(addr + offset, 0, len);
|
||||
flush_dcache_page(page);
|
||||
kunmap_local(addr);
|
||||
|
||||
@@ -36,8 +36,9 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud,
|
||||
unsigned long addr);
|
||||
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
|
||||
unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd);
|
||||
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
|
||||
pgprot_t newprot, unsigned long cp_flags);
|
||||
int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
|
||||
pmd_t *pmd, unsigned long addr, pgprot_t newprot,
|
||||
unsigned long cp_flags);
|
||||
vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
|
||||
pgprot_t pgprot, bool write);
|
||||
|
||||
@@ -172,6 +173,20 @@ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma)
|
||||
return false;
|
||||
}
|
||||
|
||||
static inline bool file_thp_enabled(struct vm_area_struct *vma)
|
||||
{
|
||||
struct inode *inode;
|
||||
|
||||
if (!vma->vm_file)
|
||||
return false;
|
||||
|
||||
inode = vma->vm_file->f_inode;
|
||||
|
||||
return (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) &&
|
||||
(vma->vm_flags & VM_EXEC) &&
|
||||
!inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
|
||||
}
|
||||
|
||||
bool transparent_hugepage_active(struct vm_area_struct *vma);
|
||||
|
||||
#define transparent_hugepage_use_zero_page() \
|
||||
@@ -347,7 +362,6 @@ static inline void prep_transhuge_page(struct page *page) {}
|
||||
static inline bool
|
||||
can_split_folio(struct folio *folio, int *pextra_pins)
|
||||
{
|
||||
BUILD_BUG();
|
||||
return false;
|
||||
}
|
||||
static inline int
|
||||
|
||||
@@ -137,17 +137,19 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma,
			     struct vm_area_struct *new_vma,
			     unsigned long old_addr, unsigned long new_addr,
			     unsigned long len);
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *,
			    struct vm_area_struct *, struct vm_area_struct *);
long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
			 struct page **, struct vm_area_struct **,
			 unsigned long *, unsigned long *, long, unsigned int,
			 int *);
void unmap_hugepage_range(struct vm_area_struct *,
			  unsigned long, unsigned long, struct page *);
			  unsigned long, unsigned long, struct page *,
			  zap_flags_t);
void __unmap_hugepage_range_final(struct mmu_gather *tlb,
			  struct vm_area_struct *vma,
			  unsigned long start, unsigned long end,
			  struct page *ref_page);
			  struct page *ref_page, zap_flags_t zap_flags);
void hugetlb_report_meminfo(struct seq_file *);
int hugetlb_report_node_meminfo(char *buf, int len, int nid);
void hugetlb_show_meminfo(void);

@@ -160,7 +162,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
			     unsigned long dst_addr,
			     unsigned long src_addr,
			     enum mcopy_atomic_mode mode,
			     struct page **pagep);
			     struct page **pagep,
			     bool wp_copy);
#endif /* CONFIG_USERFAULTFD */
bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
			   struct vm_area_struct *vma,

@@ -210,7 +213,8 @@ struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address,
int pmd_huge(pmd_t pmd);
int pud_huge(pud_t pud);
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
		unsigned long address, unsigned long end, pgprot_t newprot);
		unsigned long address, unsigned long end, pgprot_t newprot,
		unsigned long cp_flags);

bool is_hugetlb_entry_migration(pte_t pte);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);

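The old and new hugetlb_change_protection() declarations above show the added cp_flags argument, mirroring what change_protection() already takes for ordinary pages, so callers such as a write-protect path can pass the same MM_CP_* bits. A hedged sketch, assuming the existing MM_CP_UFFD_WP / MM_CP_UFFD_WP_RESOLVE flags from include/linux/mm.h; the wrapper itself is invented for illustration.

/* Illustrative only: toggle write protection on a hugetlb range. */
static unsigned long example_wp_hugetlb(struct vm_area_struct *vma,
					unsigned long start,
					unsigned long end, bool enable_wp)
{
	unsigned long cp_flags = enable_wp ? MM_CP_UFFD_WP
					   : MM_CP_UFFD_WP_RESOLVE;

	/* Returns the number of pages whose protection was changed. */
	return hugetlb_change_protection(vma, start, end,
					 vma->vm_page_prot, cp_flags);
}
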
@@ -266,7 +270,9 @@ static inline struct page *follow_huge_addr(struct mm_struct *mm,
}

static inline int copy_hugetlb_page_range(struct mm_struct *dst,
					  struct mm_struct *src, struct vm_area_struct *vma)
					  struct mm_struct *src,
					  struct vm_area_struct *dst_vma,
					  struct vm_area_struct *src_vma)
{
	BUG();
	return 0;

@@ -356,7 +362,8 @@ static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
					   unsigned long dst_addr,
					   unsigned long src_addr,
					   enum mcopy_atomic_mode mode,
					   struct page **pagep)
					   struct page **pagep,
					   bool wp_copy)
{
	BUG();
	return 0;

@@ -395,14 +402,16 @@ static inline void move_hugetlb_state(struct page *oldpage,

static inline unsigned long hugetlb_change_protection(
			struct vm_area_struct *vma, unsigned long address,
			unsigned long end, pgprot_t newprot)
			unsigned long end, pgprot_t newprot,
			unsigned long cp_flags)
{
	return 0;
}

static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb,
			struct vm_area_struct *vma, unsigned long start,
			unsigned long end, struct page *ref_page)
			unsigned long end, struct page *ref_page,
			zap_flags_t zap_flags)
{
	BUG();
}

@@ -623,8 +632,8 @@ struct hstate {
	unsigned int nr_huge_pages_node[MAX_NUMNODES];
	unsigned int free_huge_pages_node[MAX_NUMNODES];
	unsigned int surplus_huge_pages_node[MAX_NUMNODES];
#ifdef CONFIG_HUGETLB_PAGE_FREE_VMEMMAP
	unsigned int nr_free_vmemmap_pages;
#ifdef CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP
	unsigned int optimize_vmemmap_pages;
#endif
#ifdef CONFIG_CGROUP_HUGETLB
	/* cgroup control files */

@@ -1084,6 +1093,17 @@ static inline void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr
					pte_t *ptep, pte_t pte, unsigned long sz)
{
}

static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma,
					  unsigned long addr, pte_t *ptep)
{
	return *ptep;
}

static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
				   pte_t *ptep, pte_t pte)
{
}
#endif /* CONFIG_HUGETLB_PAGE */

static inline spinlock_t *huge_pte_lock(struct hstate *h,

@@ -23,10 +23,10 @@ struct task_struct;

typedef unsigned int __bitwise kasan_vmalloc_flags_t;

#define KASAN_VMALLOC_NONE		0x00u
#define KASAN_VMALLOC_INIT		0x01u
#define KASAN_VMALLOC_VM_ALLOC		0x02u
#define KASAN_VMALLOC_PROT_NORMAL	0x04u
#define KASAN_VMALLOC_NONE		((__force kasan_vmalloc_flags_t)0x00u)
#define KASAN_VMALLOC_INIT		((__force kasan_vmalloc_flags_t)0x01u)
#define KASAN_VMALLOC_VM_ALLOC		((__force kasan_vmalloc_flags_t)0x02u)
#define KASAN_VMALLOC_PROT_NORMAL	((__force kasan_vmalloc_flags_t)0x04u)

#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)

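The four flag definitions appear old-then-new: the __force casts make each constant carry the __bitwise kasan_vmalloc_flags_t type, so sparse can flag accidental mixing with plain integers while ordinary flag arithmetic keeps working. A hedged illustration; the helper below is hypothetical and not part of the diff.

/* Hypothetical helper: flag tests stay within the bitwise type. */
static bool example_wants_init(kasan_vmalloc_flags_t flags)
{
	return (flags & KASAN_VMALLOC_INIT) != KASAN_VMALLOC_NONE;
}
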
Some files were not shown because too many files have changed in this diff.