mirror of
https://github.com/lkl/linux.git
synced 2025-12-19 08:03:01 +09:00
When a process is duplicated, but the child shares the address space with the parent, there is potential for the threads sharing a single stack to cause conflicts for each other. In the normal non-CET case this is handled in two ways. With regular CLONE_VM a new stack is provided by userspace such that the parent and child have different stacks. For vfork, the parent is suspended until the child exits. So as long as the child doesn't return from the vfork()/CLONE_VFORK calling function and sticks to a limited set of operations, the parent and child can share the same stack. For shadow stack, these scenarios present similar sharing problems. For the CLONE_VM case, the child and the parent must have separate shadow stacks. Instead of changing clone to take a shadow stack, have the kernel just allocate one and switch to it. Use stack_size passed from clone3() syscall for thread shadow stack size. A compat-mode thread shadow stack size is further reduced to 1/4. This allows more threads to run in a 32-bit address space. The clone() does not pass stack_size, which was added to clone3(). In that case, use RLIMIT_STACK size and cap to 4 GB. For shadow stack enabled vfork(), the parent and child can share the same shadow stack, like they can share a normal stack. Since the parent is suspended until the child terminates, the child will not interfere with the parent while executing as long as it doesn't return from the vfork() and overwrite up the shadow stack. The child can safely overwrite down the shadow stack, as the parent can just overwrite this later. So CET does not add any additional limitations for vfork(). Free the shadow stack on thread exit by doing it in mm_release(). Skip this when exiting a vfork() child since the stack is shared in the parent. During this operation, the shadow stack pointer of the new thread needs to be updated to point to the newly allocated shadow stack. Since the ability to do this is confined to the FPU subsystem, change fpu_clone() to take the new shadow stack pointer, and update it internally inside the FPU subsystem. This part was suggested by Thomas Gleixner. Co-developed-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Suggested-by: Thomas Gleixner <tglx@linutronix.de> Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com> Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de> Reviewed-by: Kees Cook <keescook@chromium.org> Acked-by: Mike Rapoport (IBM) <rppt@kernel.org> Tested-by: Pengfei Xu <pengfei.xu@intel.com> Tested-by: John Allen <john.allen@amd.com> Tested-by: Kees Cook <keescook@chromium.org> Link: https://lore.kernel.org/all/20230613001108.3040476-30-rick.p.edgecombe%40intel.com
267 lines
6.8 KiB
C
267 lines
6.8 KiB
C
/* SPDX-License-Identifier: GPL-2.0 */
|
|
#ifndef _ASM_X86_MMU_CONTEXT_H
|
|
#define _ASM_X86_MMU_CONTEXT_H
|
|
|
|
#include <asm/desc.h>
|
|
#include <linux/atomic.h>
|
|
#include <linux/mm_types.h>
|
|
#include <linux/pkeys.h>
|
|
|
|
#include <trace/events/tlb.h>
|
|
|
|
#include <asm/tlbflush.h>
|
|
#include <asm/paravirt.h>
|
|
#include <asm/debugreg.h>
|
|
#include <asm/gsseg.h>
|
|
|
|
extern atomic64_t last_mm_ctx_id;
|
|
|
|
#ifdef CONFIG_PERF_EVENTS
|
|
DECLARE_STATIC_KEY_FALSE(rdpmc_never_available_key);
|
|
DECLARE_STATIC_KEY_FALSE(rdpmc_always_available_key);
|
|
void cr4_update_pce(void *ignored);
|
|
#endif
|
|
|
|
#ifdef CONFIG_MODIFY_LDT_SYSCALL
|
|
/*
|
|
* ldt_structs can be allocated, used, and freed, but they are never
|
|
* modified while live.
|
|
*/
|
|
struct ldt_struct {
|
|
/*
|
|
* Xen requires page-aligned LDTs with special permissions. This is
|
|
* needed to prevent us from installing evil descriptors such as
|
|
* call gates. On native, we could merge the ldt_struct and LDT
|
|
* allocations, but it's not worth trying to optimize.
|
|
*/
|
|
struct desc_struct *entries;
|
|
unsigned int nr_entries;
|
|
|
|
/*
|
|
* If PTI is in use, then the entries array is not mapped while we're
|
|
* in user mode. The whole array will be aliased at the addressed
|
|
* given by ldt_slot_va(slot). We use two slots so that we can allocate
|
|
* and map, and enable a new LDT without invalidating the mapping
|
|
* of an older, still-in-use LDT.
|
|
*
|
|
* slot will be -1 if this LDT doesn't have an alias mapping.
|
|
*/
|
|
int slot;
|
|
};
|
|
|
|
/*
|
|
* Used for LDT copy/destruction.
|
|
*/
|
|
static inline void init_new_context_ldt(struct mm_struct *mm)
|
|
{
|
|
mm->context.ldt = NULL;
|
|
init_rwsem(&mm->context.ldt_usr_sem);
|
|
}
|
|
int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
|
|
void destroy_context_ldt(struct mm_struct *mm);
|
|
void ldt_arch_exit_mmap(struct mm_struct *mm);
|
|
#else /* CONFIG_MODIFY_LDT_SYSCALL */
|
|
static inline void init_new_context_ldt(struct mm_struct *mm) { }
|
|
static inline int ldt_dup_context(struct mm_struct *oldmm,
|
|
struct mm_struct *mm)
|
|
{
|
|
return 0;
|
|
}
|
|
static inline void destroy_context_ldt(struct mm_struct *mm) { }
|
|
static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
|
|
#endif
|
|
|
|
#ifdef CONFIG_MODIFY_LDT_SYSCALL
|
|
extern void load_mm_ldt(struct mm_struct *mm);
|
|
extern void switch_ldt(struct mm_struct *prev, struct mm_struct *next);
|
|
#else
|
|
static inline void load_mm_ldt(struct mm_struct *mm)
|
|
{
|
|
clear_LDT();
|
|
}
|
|
static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
|
|
{
|
|
DEBUG_LOCKS_WARN_ON(preemptible());
|
|
}
|
|
#endif
|
|
|
|
#ifdef CONFIG_ADDRESS_MASKING
|
|
static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
|
|
{
|
|
return mm->context.lam_cr3_mask;
|
|
}
|
|
|
|
static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
|
|
{
|
|
mm->context.lam_cr3_mask = oldmm->context.lam_cr3_mask;
|
|
mm->context.untag_mask = oldmm->context.untag_mask;
|
|
}
|
|
|
|
#define mm_untag_mask mm_untag_mask
|
|
static inline unsigned long mm_untag_mask(struct mm_struct *mm)
|
|
{
|
|
return mm->context.untag_mask;
|
|
}
|
|
|
|
static inline void mm_reset_untag_mask(struct mm_struct *mm)
|
|
{
|
|
mm->context.untag_mask = -1UL;
|
|
}
|
|
|
|
#define arch_pgtable_dma_compat arch_pgtable_dma_compat
|
|
static inline bool arch_pgtable_dma_compat(struct mm_struct *mm)
|
|
{
|
|
return !mm_lam_cr3_mask(mm) ||
|
|
test_bit(MM_CONTEXT_FORCE_TAGGED_SVA, &mm->context.flags);
|
|
}
|
|
#else
|
|
|
|
static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
|
|
{
|
|
return 0;
|
|
}
|
|
|
|
static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)
|
|
{
|
|
}
|
|
|
|
static inline void mm_reset_untag_mask(struct mm_struct *mm)
|
|
{
|
|
}
|
|
#endif
|
|
|
|
#define enter_lazy_tlb enter_lazy_tlb
|
|
extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk);
|
|
|
|
/*
|
|
* Init a new mm. Used on mm copies, like at fork()
|
|
* and on mm's that are brand-new, like at execve().
|
|
*/
|
|
#define init_new_context init_new_context
|
|
static inline int init_new_context(struct task_struct *tsk,
|
|
struct mm_struct *mm)
|
|
{
|
|
mutex_init(&mm->context.lock);
|
|
|
|
mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id);
|
|
atomic64_set(&mm->context.tlb_gen, 0);
|
|
|
|
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
|
|
if (cpu_feature_enabled(X86_FEATURE_OSPKE)) {
|
|
/* pkey 0 is the default and allocated implicitly */
|
|
mm->context.pkey_allocation_map = 0x1;
|
|
/* -1 means unallocated or invalid */
|
|
mm->context.execute_only_pkey = -1;
|
|
}
|
|
#endif
|
|
mm_reset_untag_mask(mm);
|
|
init_new_context_ldt(mm);
|
|
return 0;
|
|
}
|
|
|
|
#define destroy_context destroy_context
|
|
static inline void destroy_context(struct mm_struct *mm)
|
|
{
|
|
destroy_context_ldt(mm);
|
|
}
|
|
|
|
extern void switch_mm(struct mm_struct *prev, struct mm_struct *next,
|
|
struct task_struct *tsk);
|
|
|
|
extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
|
struct task_struct *tsk);
|
|
#define switch_mm_irqs_off switch_mm_irqs_off
|
|
|
|
#define activate_mm(prev, next) \
|
|
do { \
|
|
paravirt_enter_mmap(next); \
|
|
switch_mm((prev), (next), NULL); \
|
|
} while (0);
|
|
|
|
#ifdef CONFIG_X86_32
|
|
#define deactivate_mm(tsk, mm) \
|
|
do { \
|
|
loadsegment(gs, 0); \
|
|
} while (0)
|
|
#else
|
|
#define deactivate_mm(tsk, mm) \
|
|
do { \
|
|
if (!tsk->vfork_done) \
|
|
shstk_free(tsk); \
|
|
load_gs_index(0); \
|
|
loadsegment(fs, 0); \
|
|
} while (0)
|
|
#endif
|
|
|
|
static inline void arch_dup_pkeys(struct mm_struct *oldmm,
|
|
struct mm_struct *mm)
|
|
{
|
|
#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
|
|
if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
|
|
return;
|
|
|
|
/* Duplicate the oldmm pkey state in mm: */
|
|
mm->context.pkey_allocation_map = oldmm->context.pkey_allocation_map;
|
|
mm->context.execute_only_pkey = oldmm->context.execute_only_pkey;
|
|
#endif
|
|
}
|
|
|
|
static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
|
|
{
|
|
arch_dup_pkeys(oldmm, mm);
|
|
paravirt_enter_mmap(mm);
|
|
dup_lam(oldmm, mm);
|
|
return ldt_dup_context(oldmm, mm);
|
|
}
|
|
|
|
static inline void arch_exit_mmap(struct mm_struct *mm)
|
|
{
|
|
paravirt_arch_exit_mmap(mm);
|
|
ldt_arch_exit_mmap(mm);
|
|
}
|
|
|
|
#ifdef CONFIG_X86_64
|
|
static inline bool is_64bit_mm(struct mm_struct *mm)
|
|
{
|
|
return !IS_ENABLED(CONFIG_IA32_EMULATION) ||
|
|
!test_bit(MM_CONTEXT_UPROBE_IA32, &mm->context.flags);
|
|
}
|
|
#else
|
|
static inline bool is_64bit_mm(struct mm_struct *mm)
|
|
{
|
|
return false;
|
|
}
|
|
#endif
|
|
|
|
static inline void arch_unmap(struct mm_struct *mm, unsigned long start,
|
|
unsigned long end)
|
|
{
|
|
}
|
|
|
|
/*
|
|
* We only want to enforce protection keys on the current process
|
|
* because we effectively have no access to PKRU for other
|
|
* processes or any way to tell *which * PKRU in a threaded
|
|
* process we could use.
|
|
*
|
|
* So do not enforce things if the VMA is not from the current
|
|
* mm, or if we are in a kernel thread.
|
|
*/
|
|
static inline bool arch_vma_access_permitted(struct vm_area_struct *vma,
|
|
bool write, bool execute, bool foreign)
|
|
{
|
|
/* pkeys never affect instruction fetches */
|
|
if (execute)
|
|
return true;
|
|
/* allow access if the VMA is not one from this process */
|
|
if (foreign || vma_is_foreign(vma))
|
|
return true;
|
|
return __pkru_allows_pkey(vma_pkey(vma), write);
|
|
}
|
|
|
|
unsigned long __get_current_cr3_fast(void);
|
|
|
|
#include <asm-generic/mmu_context.h>
|
|
|
|
#endif /* _ASM_X86_MMU_CONTEXT_H */
|