Files
lkl_linux/arch/x86/include/asm/mmu_context.h
Rick Edgecombe b2926a36b9 x86/shstk: Handle thread shadow stack
When a process is duplicated, but the child shares the address space with
the parent, there is potential for the threads sharing a single stack to
cause conflicts for each other. In the normal non-CET case this is handled
in two ways.

With regular CLONE_VM a new stack is provided by userspace such that the
parent and child have different stacks.

For vfork, the parent is suspended until the child exits. So as long as
the child doesn't return from the vfork()/CLONE_VFORK calling function and
sticks to a limited set of operations, the parent and child can share the
same stack.

For shadow stack, these scenarios present similar sharing problems. For the
CLONE_VM case, the child and the parent must have separate shadow stacks.
Instead of changing clone to take a shadow stack, have the kernel just
allocate one and switch to it.

Use stack_size passed from clone3() syscall for thread shadow stack size. A
compat-mode thread shadow stack size is further reduced to 1/4. This
allows more threads to run in a 32-bit address space. The clone() does not
pass stack_size, which was added to clone3(). In that case, use
RLIMIT_STACK size and cap to 4 GB.

For shadow stack enabled vfork(), the parent and child can share the same
shadow stack, like they can share a normal stack. Since the parent is
suspended until the child terminates, the child will not interfere with
the parent while executing as long as it doesn't return from the vfork()
and overwrite up the shadow stack. The child can safely overwrite down
the shadow stack, as the parent can just overwrite this later. So CET does
not add any additional limitations for vfork().

Free the shadow stack on thread exit by doing it in mm_release(). Skip
this when exiting a vfork() child since the stack is shared in the
parent.

During this operation, the shadow stack pointer of the new thread needs
to be updated to point to the newly allocated shadow stack. Since the
ability to do this is confined to the FPU subsystem, change
fpu_clone() to take the new shadow stack pointer, and update it
internally inside the FPU subsystem. This part was suggested by Thomas
Gleixner.

Co-developed-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
Suggested-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
Reviewed-by: Kees Cook <keescook@chromium.org>
Acked-by: Mike Rapoport (IBM) <rppt@kernel.org>
Tested-by: Pengfei Xu <pengfei.xu@intel.com>
Tested-by: John Allen <john.allen@amd.com>
Tested-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/all/20230613001108.3040476-30-rick.p.edgecombe%40intel.com
2023-08-02 15:01:50 -07:00

267 lines
6.8 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_MMU_CONTEXT_H
#define _ASM_X86_MMU_CONTEXT_H
#include <asm/desc.h>
#include <linux/atomic.h>
#include <linux/mm_types.h>
#include <linux/pkeys.h>
#include <trace/events/tlb.h>
#include <asm/tlbflush.h>
#include <asm/paravirt.h>
#include <asm/debugreg.h>
#include <asm/gsseg.h>
extern atomic64_t last_mm_ctx_id;
#ifdef CONFIG_PERF_EVENTS
DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key);
DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);
void cr4_update_pce(void *ignored);
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
/*
* ldt_structs can be allocated, used, and freed, but they are never
* modified while live.
*/
struct ldt_struct {
/*
* Xen requires page-aligned LDTs with special permissions. This is
* needed to prevent us from installing evil descriptors such as
* call gates. On native, we could merge the ldt_struct and LDT
* allocations, but it's not worth trying to optimize.
*/
struct desc_struct *entries;
unsigned int nr_entries;
/*
* If PTI is in use, then the entries array is not mapped while we're
* in user mode. The whole array will be aliased at the addressed
* given by ldt_slot_va(slot). We use two slots so that we can allocate
* and map, and enable a new LDT without invalidating the mapping
* of an older, still-in-use LDT.
*
* slot will be -1 if this LDT doesn't have an alias mapping.
*/
int slot;
};
/*
* Used for LDT copy/destruction.
*/
static inline void init_new_context_ldt(struct mm_struct *mm)
{
mm->context.ldt = NULL;
init_rwsem(&mm->context.ldt_usr_sem);
}
int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
void destroy_context_ldt(struct mm_struct *mm);
void ldt_arch_exit_mmap(struct mm_struct *mm);
#else /* CONFIG_MODIFY_LDT_SYSCALL */
static inline void init_new_context_ldt(struct mm_struct *mm) { }
static inline int ldt_dup_context(struct mm_struct *oldmm,
struct mm_struct *mm)
{
return 0;
}
static inline void destroy_context_ldt(struct mm_struct *mm) { }
static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
extern void load_mm_ldt(struct mm_struct *mm);
extern void switch_ldt(struct mm_struct *prev, struct mm_struct *next);
#else
static inline void load_mm_ldt(struct mm_struct *mm)
{
clear_LDT();
}
static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
{
DEBUG_LOCKS_WARN_ON(preemptible());
}
#endif
#ifdef CONFIG_ADDRESS_MASKING
static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
{
return mm->context.lam_cr3_mask;
}
static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
{
mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask;
mm->context.untag_mask = oldmm->context.untag_mask;
}
#define mm_untag_mask mm_untag_mask
static inline unsigned long mm_untag_mask(struct mm_struct *mm)
{
return mm->context.untag_mask;
}
static inline void mm_reset_untag_mask(struct mm_struct *mm)
{
mm->context.untag_mask = -1UL;
}
#define arch_pgtable_dma_compat arch_pgtable_dma_compat
static inline bool arch_pgtable_dma_compat(struct mm_struct *mm)
{
return !mm_lam_cr3_mask(mm) ||
test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags);
}
#else
static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
{
return 0;
}
static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
{
}
static inline void mm_reset_untag_mask(struct mm_struct *mm)
{
}
#endif
#define enter_lazy_tlb enter_lazy_tlb
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
/*
* Init a new mm. Used on mm copies, like at fork()
* and on mm's that are brand-new, like at execve().
*/
#define init_new_context init_new_context
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
{
mutex_init(&mm->context.lock);
mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
atomic64_set(&mm->context.tlb_gen, 0);
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
/* pkey 0 is the default and allocated implicitly */
mm->context.pkey_allocation_map = 0x1;
/* -1 means unallocated or invalid */
mm->context.execute_only_pkey = -1;
}
#endif
mm_reset_untag_mask(mm);
init_new_context_ldt(mm);
return 0;
}
#define destroy_context destroy_context
static inline void destroy_context(struct mm_struct *mm)
{
destroy_context_ldt(mm);
}
extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk);
extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk);
#define switch_mm_irqs_off switch_mm_irqs_off
#define activate_mm(prev, next) \
do { \
paravirt_enter_mmap(next); \
switch_mm((prev), (next), NULL); \
} while (0);
#ifdef CONFIG_X86_32
#define deactivate_mm(tsk, mm) \
do { \
loadsegment(gs, 0); \
} while (0)
#else
#define deactivate_mm(tsk, mm) \
do { \
if (!tsk->vfork_done) \
shstk_free(tsk); \
load_gs_index(0); \
loadsegment(fs, 0); \
} while (0)
#endif
static inline void arch_dup_pkeys(struct mm_struct *oldmm,
struct mm_struct *mm)
{
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
return;
/* Duplicate the oldmm pkey state in mm: */
mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map;
mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
#endif
}
static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
{
arch_dup_pkeys(oldmm, mm);
paravirt_enter_mmap(mm);
dup_lam(oldmm, mm);
return ldt_dup_context(oldmm, mm);
}
static inline void arch_exit_mmap(struct mm_struct *mm)
{
paravirt_arch_exit_mmap(mm);
ldt_arch_exit_mmap(mm);
}
#ifdef CONFIG_X86_64
static inline bool is_64bit_mm(struct mm_struct *mm)
{
return !IS_ENABLED(CONFIG_IA32_EMULATION) ||
!test_bit(MM_CONTEXT_UPROBE_IA32, &mm->context.flags);
}
#else
static inline bool is_64bit_mm(struct mm_struct *mm)
{
return false;
}
#endif
static inline void arch_unmap(struct mm_struct *mm, unsigned long start,
unsigned long end)
{
}
/*
* We only want to enforce protection keys on the current process
* because we effectively have no access to PKRU for other
* processes or any way to tell *which * PKRU in a threaded
* process we could use.
*
* So do not enforce things if the VMA is not from the current
* mm, or if we are in a kernel thread.
*/
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
bool write, bool execute, bool foreign)
{
/* pkeys never affect instruction fetches */
if (execute)
return true;
/* allow access if the VMA is not one from this process */
if (foreign || vma_is_foreign(vma))
return true;
return __pkru_allows_pkey(vma_pkey(vma), write);
}
unsigned long __get_current_cr3_fast(void);
#include <asm-generic/mmu_context.h>
#endif /* _ASM_X86_MMU_CONTEXT_H */