diff --git a/Makefile b/Makefile
index 80b8671d5c46f7..714163aed0b633 100644
--- a/Makefile
+++ b/Makefile
@@ -612,6 +612,12 @@ endif # $(dot-config)
 # Defaults to vmlinux, but the arch makefile usually adds further targets
 all: vmlinux
 
+# force no-pie for distro compilers that enable pie by default
+KBUILD_CFLAGS += $(call cc-option, -fno-pie)
+KBUILD_CFLAGS += $(call cc-option, -no-pie)
+KBUILD_AFLAGS += $(call cc-option, -fno-pie)
+KBUILD_CPPFLAGS += $(call cc-option, -fno-pie)
+
 # The arch Makefile can set ARCH_{CPP,A,C}FLAGS to override the default
 # values of the respective KBUILD_* variables
 ARCH_CPPFLAGS :=
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index dc802306045653..5313ec9ac57ed1 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1276,6 +1276,16 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	if (error_code & PF_INSTR)
 		flags |= FAULT_FLAG_INSTRUCTION;
 
+	if (error_code & PF_USER) {
+		fault = handle_speculative_fault(mm, address,
+				flags & ~FAULT_FLAG_ALLOW_RETRY);
+
+		if (fault & VM_FAULT_RETRY)
+			goto retry;
+
+		goto done;
+	}
+
 	/*
 	 * When running in the kernel we expect faults to occur only to
 	 * addresses in user space. All other faults represent errors in
@@ -1379,7 +1389,15 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 		return;
 	}
 
+	if (unlikely(fault & VM_FAULT_RETRY)) {
+		if (fatal_signal_pending(current))
+			return;
+
+		goto done;
+	}
+
 	up_read(&mm->mmap_sem);
+done:
 	if (unlikely(fault & VM_FAULT_ERROR)) {
 		mm_fault_error(regs, error_code, address, vma, fault);
 		return;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ef815b9cd42696..6d4285c0df65f8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -280,6 +280,7 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_USER		0x40	/* The fault originated in userspace */
 #define FAULT_FLAG_REMOTE	0x80	/* faulting for non current tsk/mm */
 #define FAULT_FLAG_INSTRUCTION  0x100	/* The fault was during an instruction fetch */
+#define FAULT_FLAG_SPECULATIVE	0x200	/* Speculative fault, not holding mmap_sem */
 
 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
@@ -319,6 +320,7 @@ struct fault_env {
 	struct vm_area_struct *vma;	/* Target VMA */
 	unsigned long address;		/* Faulting virtual address */
 	unsigned int flags;		/* FAULT_FLAG_xxx flags */
+	unsigned int sequence;
 	pmd_t *pmd;			/* Pointer to pmd entry matching
 					 * the 'address'
 					 */
@@ -1257,6 +1259,8 @@ int invalidate_inode_page(struct page *page);
 #ifdef CONFIG_MMU
 extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 		unsigned int flags);
+extern int handle_speculative_fault(struct mm_struct *mm,
+		unsigned long address, unsigned int flags);
 extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
 			    unsigned long address, unsigned int fault_flags,
 			    bool *unlocked);
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 903200f4ec41ce..eac866b0987ffb 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -358,6 +358,8 @@ struct vm_area_struct {
 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
#endif
 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
+	seqcount_t vm_sequence;
+	struct rcu_head vm_rcu_head;
 };
 
 struct core_thread {
@@ -396,6 +398,7 @@ struct kioctx_table;
 struct mm_struct {
 	struct vm_area_struct *mmap;		/* list of VMAs */
 	struct rb_root mm_rb;
+	seqlock_t mm_seq;
 	u32 vmacache_seqnum;			/* per-thread vmacache */
 #ifdef CONFIG_MMU
 	unsigned long (*get_unmapped_area) (struct file *filp,
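Taken together, the hunks above introduce the state the speculative path depends on: a FAULT_FLAG_SPECULATIVE fault flag plus a sequence snapshot in struct fault_env, a per-VMA seqcount (vm_sequence) and rcu_head for deferred freeing, and a per-mm seqlock (mm_seq) guarding the VMA rbtree. The x86 entry point then tries the lockless path first for user-mode faults and falls back to the usual mmap_sem-protected walk only when the speculative attempt returns VM_FAULT_RETRY. A minimal user-space sketch of that fallback shape follows; the *_model functions are invented stand-ins, not kernel interfaces, and only the control flow is meant to match.

/* Build: cc -std=c11 spf_fallback.c -o spf_fallback */
#include <stdio.h>

#define VM_FAULT_RETRY	0x0400	/* mirrors the kernel's VM_FAULT_RETRY bit */

/* Stand-in for handle_speculative_fault(): fails when it raced. */
static unsigned int speculative_fault_model(unsigned long address, int raced)
{
	(void)address;
	return raced ? VM_FAULT_RETRY : 0;
}

/* Stand-in for the classic handle_mm_fault() path under mmap_sem. */
static unsigned int locked_fault_model(unsigned long address)
{
	(void)address;
	return 0;
}

static unsigned int do_page_fault_model(unsigned long address, int raced)
{
	unsigned int fault = speculative_fault_model(address, raced);

	if (!(fault & VM_FAULT_RETRY))
		return fault;			/* handled without mmap_sem */

	/* Speculation lost a race: redo the fault under the lock. */
	return locked_fault_model(address);
}

int main(void)
{
	printf("no race: %#x\n", do_page_fault_model(0x1000, 0));
	printf("raced:   %#x\n", do_page_fault_model(0x1000, 1));
	return 0;
}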
diff --git a/kernel/fork.c b/kernel/fork.c
index beb31725f7e274..a15f5fdf129c2d 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -603,6 +603,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 	mm->mmap = NULL;
 	mm->mm_rb = RB_ROOT;
 	mm->vmacache_seqnum = 0;
+	seqlock_init(&mm->mm_seq);
 	atomic_set(&mm->mm_users, 1);
 	atomic_set(&mm->mm_count, 1);
 	init_rwsem(&mm->mmap_sem);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index a56a851908d245..5ef625bbb33428 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -15,6 +15,7 @@
 
 struct mm_struct init_mm = {
 	.mm_rb		= RB_ROOT,
+	.mm_seq		= __SEQLOCK_UNLOCKED(init_mm.mm_seq),
 	.pgd		= swapper_pg_dir,
 	.mm_users	= ATOMIC_INIT(2),
 	.mm_count	= ATOMIC_INIT(1),
diff --git a/mm/internal.h b/mm/internal.h
index 1501304f87a41a..2f6c700e237534 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -38,6 +38,24 @@
 int do_swap_page(struct fault_env *fe, pte_t orig_pte);
 
+extern struct srcu_struct vma_srcu;
+
+extern struct vm_area_struct *find_vma_srcu(struct mm_struct *mm, unsigned long addr);
+
+static inline bool vma_is_dead(struct vm_area_struct *vma, unsigned int sequence)
+{
+	int ret = RB_EMPTY_NODE(&vma->vm_rb);
+	unsigned seq = ACCESS_ONCE(vma->vm_sequence.sequence);
+
+	/*
+	 * Matches both the wmb in write_seqlock_{begin,end}() and
+	 * the wmb in vma_rb_erase().
+	 */
+	smp_rmb();
+
+	return ret || seq != sequence;
+}
+
 void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 		unsigned long floor, unsigned long ceiling);
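vma_is_dead() is the validation step the speculative path repeats at every dangerous point: a VMA found without mmap_sem is unusable if it has been erased from the rbtree (RB_EMPTY_NODE) or if its vm_sequence no longer matches the snapshot taken at lookup time, and the smp_rmb() pairs with the writers' barriers in write_seqcount_begin()/end() and vma_rb_erase(). The sketch below models the combined checks (the odd-sequence test is done by handle_speculative_fault() before the walk, the rest by vma_is_dead()) as a standalone user-space program, using C11 acquire loads where the kernel uses explicit barriers; struct vma_model and the function names are invented for the example.

/* Build: cc -std=c11 vma_seq_model.c -o vma_seq_model */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Only the two fields the validation needs. */
struct vma_model {
	atomic_uint sequence;	/* models vma->vm_sequence           */
	atomic_bool erased;	/* models RB_EMPTY_NODE(&vma->vm_rb) */
};

/* Writer side: odd while an update is in flight, even when it is done. */
static void vma_model_modify(struct vma_model *vma)
{
	atomic_fetch_add_explicit(&vma->sequence, 1, memory_order_release);
	/* ... change vm_start/vm_end, unmap pages, ... */
	atomic_fetch_add_explicit(&vma->sequence, 1, memory_order_release);
}

/* Reader side: snapshot the sequence when the VMA is found ... */
static unsigned int vma_model_snapshot(struct vma_model *vma)
{
	return atomic_load_explicit(&vma->sequence, memory_order_acquire);
}

/* ... and re-check it (plus the erased flag) before committing. */
static bool vma_model_is_dead(struct vma_model *vma, unsigned int snap)
{
	bool erased = atomic_load_explicit(&vma->erased, memory_order_acquire);
	unsigned int seq = atomic_load_explicit(&vma->sequence, memory_order_acquire);

	return erased || (seq & 1) || seq != snap;
}

int main(void)
{
	struct vma_model vma;
	unsigned int snap;

	atomic_init(&vma.sequence, 0);
	atomic_init(&vma.erased, false);

	snap = vma_model_snapshot(&vma);
	printf("untouched: dead=%d\n", vma_model_is_dead(&vma, snap));

	vma_model_modify(&vma);
	printf("modified:  dead=%d\n", vma_model_is_dead(&vma, snap));
	return 0;
}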
diff --git a/mm/memory.c b/mm/memory.c
index 793fe0f9841c09..1c06b45c609713 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1290,6 +1290,7 @@ void unmap_page_range(struct mmu_gather *tlb,
 	unsigned long next;
 
 	BUG_ON(addr >= end);
+	write_seqcount_begin(&vma->vm_sequence);
 	tlb_start_vma(tlb, vma);
 	pgd = pgd_offset(vma->vm_mm, addr);
 	do {
@@ -1299,6 +1300,7 @@ void unmap_page_range(struct mmu_gather *tlb,
 			next = zap_pud_range(tlb, vma, pgd, addr, next, details);
 	} while (pgd++, addr = next, addr != end);
 	tlb_end_vma(tlb, vma);
+	write_seqcount_end(&vma->vm_sequence);
 }
@@ -1962,30 +1964,6 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 }
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
-/*
- * handle_pte_fault chooses page fault handler according to an entry which was
- * read non-atomically. Before making any commitment, on those architectures
- * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
- * parts, do_swap_page must check under lock before unmapping the pte and
- * proceeding (but do_wp_page is only called after already making such a check;
- * and do_anonymous_page can safely check later on).
- */
-static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
-				pte_t *page_table, pte_t orig_pte)
-{
-	int same = 1;
-#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
-	if (sizeof(pte_t) > sizeof(unsigned long)) {
-		spinlock_t *ptl = pte_lockptr(mm, pmd);
-		spin_lock(ptl);
-		same = pte_same(*page_table, orig_pte);
-		spin_unlock(ptl);
-	}
-#endif
-	pte_unmap(page_table);
-	return same;
-}
-
 static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
 {
 	debug_dma_assert_idle(src);
@@ -2119,6 +2097,70 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte,
 	return VM_FAULT_WRITE;
 }
 
+static bool pte_spinlock(struct fault_env *fe)
+{
+	bool ret = false;
+
+	/* Check if vma is still valid */
+	if (!(fe->flags & FAULT_FLAG_SPECULATIVE)) {
+		fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd);
+		spin_lock(fe->ptl);
+		return true;
+	}
+
+	local_irq_disable();
+	if (vma_is_dead(fe->vma, fe->sequence))
+		goto out;
+
+	fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd);
+	spin_lock(fe->ptl);
+
+	if (vma_is_dead(fe->vma, fe->sequence)) {
+		spin_unlock(fe->ptl);
+		goto out;
+	}
+
+	ret = true;
+out:
+	local_irq_enable();
+	return ret;
+}
+
+static bool pte_map_lock(struct fault_env *fe)
+{
+	bool ret = false;
+
+	if (!(fe->flags & FAULT_FLAG_SPECULATIVE)) {
+		fe->pte = pte_offset_map_lock(fe->vma->vm_mm, fe->pmd,
+					      fe->address, &fe->ptl);
+		return true;
+	}
+
+	/*
+	 * The first vma_is_dead() guarantees the page-tables are still valid,
+	 * having IRQs disabled ensures they stay around, hence the second
+	 * vma_is_dead() to make sure they are still valid once we've got the
+	 * lock. After that a concurrent zap_pte_range() will block on the PTL
+	 * and thus we're safe.
+	 */
+	local_irq_disable();
+	if (vma_is_dead(fe->vma, fe->sequence))
+		goto out;
+
+	fe->pte = pte_offset_map_lock(fe->vma->vm_mm, fe->pmd,
+				      fe->address, &fe->ptl);
+
+	if (vma_is_dead(fe->vma, fe->sequence)) {
+		pte_unmap_unlock(fe->pte, fe->ptl);
+		goto out;
+	}
+
+	ret = true;
+out:
+	local_irq_enable();
+	return ret;
+}
+
 /*
  * Handle the case of a page which we actually need to copy to a new page.
  *
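pte_spinlock() and pte_map_lock() are where the speculative path pays for not holding mmap_sem: in the FAULT_FLAG_SPECULATIVE case the PTL is taken only after the VMA has been revalidated, and it is revalidated again once the lock is held, so any writer that slipped in between is noticed and the fault bails out. Disabling IRQs across the window is what keeps the page tables themselves from being freed (a concurrent zap then blocks on the PTL); that part cannot be modelled in user space, but the lock-then-revalidate shape can. In the sketch below, struct obj_model, lock_if_unchanged() and the mutex standing in for the PTL are all invented names.

/* Build: cc -std=c11 -pthread ptl_model.c -o ptl_model */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Models the object a speculative reader wants to operate on. */
struct obj_model {
	atomic_uint sequence;		/* models vma->vm_sequence */
	pthread_mutex_t ptl;		/* models the page-table lock */
	int value;			/* models the PTE contents */
};

static bool obj_changed(struct obj_model *o, unsigned int snap)
{
	unsigned int seq = atomic_load_explicit(&o->sequence, memory_order_acquire);

	return (seq & 1) || seq != snap;
}

/*
 * Models the FAULT_FLAG_SPECULATIVE branch of pte_map_lock()/pte_spinlock():
 * validate, lock, validate again.  Anything that changed the object in the
 * meantime would have bumped the sequence, so a clean second check means we
 * now hold the lock on an unchanged object.
 */
static bool lock_if_unchanged(struct obj_model *o, unsigned int snap)
{
	if (obj_changed(o, snap))
		return false;

	pthread_mutex_lock(&o->ptl);

	if (obj_changed(o, snap)) {
		pthread_mutex_unlock(&o->ptl);
		return false;
	}

	return true;		/* caller may touch o->value, then unlock */
}

int main(void)
{
	struct obj_model o = { .value = 42 };
	unsigned int snap;

	pthread_mutex_init(&o.ptl, NULL);
	atomic_init(&o.sequence, 0);

	snap = atomic_load(&o.sequence);
	if (lock_if_unchanged(&o, snap)) {
		printf("locked, value=%d\n", o.value);
		pthread_mutex_unlock(&o.ptl);
	} else {
		printf("raced, fall back (VM_FAULT_RETRY)\n");
	}
	return 0;
}

The same conversion is applied, call site by call site, to the handlers that follow (wp_page_*, do_wp_page, do_swap_page, do_anonymous_page, do_numa_page and handle_pte_fault).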
@@ -2146,6 +2188,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
 	const unsigned long mmun_start = fe->address & PAGE_MASK;
 	const unsigned long mmun_end = mmun_start + PAGE_SIZE;
 	struct mem_cgroup *memcg;
+	int ret = VM_FAULT_OOM;
 
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
@@ -2172,7 +2215,11 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
-	fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl);
+	if (!pte_map_lock(fe)) {
+		mem_cgroup_cancel_charge(new_page, memcg, false);
+		ret = VM_FAULT_RETRY;
+		goto oom_free_new;
+	}
 	if (likely(pte_same(*fe->pte, orig_pte))) {
 		if (old_page) {
 			if (!PageAnon(old_page)) {
@@ -2260,7 +2307,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte,
 oom:
 	if (old_page)
 		put_page(old_page);
-	return VM_FAULT_OOM;
+	return ret;
 }
 
 /*
@@ -2285,8 +2332,12 @@ static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte)
 		ret = vma->vm_ops->pfn_mkwrite(vma, &vmf);
 		if (ret & VM_FAULT_ERROR)
 			return ret;
-		fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-				&fe->ptl);
+
+		if (!pte_map_lock(fe)) {
+			ret |= VM_FAULT_RETRY;
+			return ret;
+		}
+
 		/*
 		 * We might have raced with another page fault while we
 		 * released the pte_offset_map_lock.
@@ -2324,8 +2375,11 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte,
 	 * they did, we just return, as we can count on the
 	 * MMU to tell us if they didn't also make it writable.
 	 */
-	fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-			&fe->ptl);
+	if (!pte_map_lock(fe)) {
+		unlock_page(old_page);
+		put_page(old_page);
+		return VM_FAULT_RETRY;
+	}
 	if (!pte_same(*fe->pte, orig_pte)) {
 		unlock_page(old_page);
 		pte_unmap_unlock(fe->pte, fe->ptl);
@@ -2389,8 +2443,11 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte)
 		get_page(old_page);
 		pte_unmap_unlock(fe->pte, fe->ptl);
 		lock_page(old_page);
-		fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
-				fe->address, &fe->ptl);
+		if (!pte_map_lock(fe)) {
+			unlock_page(old_page);
+			put_page(old_page);
+			return VM_FAULT_RETRY;
+		}
 		if (!pte_same(*fe->pte, orig_pte)) {
 			unlock_page(old_page);
 			pte_unmap_unlock(fe->pte, fe->ptl);
@@ -2524,12 +2581,10 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
 	int exclusive = 0;
 	int ret = 0;
 
-	if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte))
-		goto out;
-
 	entry = pte_to_swp_entry(orig_pte);
 	if (unlikely(non_swap_entry(entry))) {
 		if (is_migration_entry(entry)) {
+			/* XXX fe->pmd might be dead */
 			migration_entry_wait(vma->vm_mm, fe->pmd, fe->address);
 		} else if (is_hwpoison_entry(entry)) {
 			ret = VM_FAULT_HWPOISON;
@@ -2549,8 +2604,10 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
 		 * Back out if somebody else faulted in this pte
 		 * while we released the pte lock.
 		 */
-		fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd,
-				fe->address, &fe->ptl);
+		if (!pte_map_lock(fe)) {
+			delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+			return VM_FAULT_RETRY;
+		}
 		if (likely(pte_same(*fe->pte, orig_pte)))
 			ret = VM_FAULT_OOM;
 		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2606,8 +2663,11 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte)
 	/*
 	 * Back out if somebody else already faulted in this pte.
 	 */
-	fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-			&fe->ptl);
+	if (!pte_map_lock(fe)) {
+		ret = VM_FAULT_RETRY;
+		mem_cgroup_cancel_charge(page, memcg, false);
+		goto out_page;
+	}
 	if (unlikely(!pte_same(*fe->pte, orig_pte)))
 		goto out_nomap;
@@ -2739,6 +2799,7 @@ static int do_anonymous_page(struct fault_env *fe)
 	struct mem_cgroup *memcg;
 	struct page *page;
 	pte_t entry;
+	int ret = 0;
 
 	/* File mapping without ->vm_ops ? */
 	if (vma->vm_flags & VM_SHARED)
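Every conversion above follows the same rule: pte_offset_map_lock() could not fail, but pte_map_lock() can, and when it does the handler has to release whatever it acquired since the PTL was dropped (memcg charge, page lock, page reference) before returning VM_FAULT_RETRY, because the whole fault will simply be redone under mmap_sem. A schematic user-space sketch of that unwind-and-retry shape follows; all names are stand-ins and the malloc/free pair merely represents the state that must be undone.

/* Build: cc -std=c11 retry_unwind.c -o retry_unwind */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define VM_FAULT_RETRY	0x0400

static bool pte_map_lock_model(bool raced)	/* models pte_map_lock() */
{
	return !raced;
}

static unsigned int fault_step_model(bool raced)
{
	/* Stand-in for state acquired after the PTL was dropped:
	 * a locked/referenced page, a memcg charge, ... */
	void *page = malloc(64);

	if (!pte_map_lock_model(raced)) {
		/* Undo everything, then let the caller retry under mmap_sem. */
		free(page);		/* unlock_page()/put_page()/
					 * mem_cgroup_cancel_charge() */
		return VM_FAULT_RETRY;
	}

	/* ... pte_same() check, install the PTE, pte_unmap_unlock() ... */
	free(page);
	return 0;
}

int main(void)
{
	printf("ok:    %#x\n", fault_step_model(false));
	printf("raced: %#x\n", fault_step_model(true));
	return 0;
}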
@@ -2770,8 +2831,8 @@ static int do_anonymous_page(struct fault_env *fe)
 			!mm_forbids_zeropage(vma->vm_mm)) {
 		entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address),
 						vma->vm_page_prot));
-		fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-				&fe->ptl);
+		if (!pte_map_lock(fe))
+			return VM_FAULT_RETRY;
 		if (!pte_none(*fe->pte))
 			goto unlock;
 		/* Deliver the page fault to userland, check inside PT lock */
@@ -2803,8 +2864,12 @@ static int do_anonymous_page(struct fault_env *fe)
 	if (vma->vm_flags & VM_WRITE)
 		entry = pte_mkwrite(pte_mkdirty(entry));
 
-	fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-			&fe->ptl);
+	if (!pte_map_lock(fe)) {
+		/* XXX: should be factorized */
+		mem_cgroup_cancel_charge(page, memcg, false);
+		put_page(page);
+		return VM_FAULT_RETRY;
+	}
 	if (!pte_none(*fe->pte))
 		goto release;
@@ -2827,7 +2892,7 @@ static int do_anonymous_page(struct fault_env *fe)
 	update_mmu_cache(vma, fe->address, fe->pte);
 unlock:
 	pte_unmap_unlock(fe->pte, fe->ptl);
-	return 0;
+	return ret;
 release:
 	mem_cgroup_cancel_charge(page, memcg, false);
 	put_page(page);
@@ -2869,7 +2934,7 @@ static int __do_fault(struct fault_env *fe, pgoff_t pgoff,
 		if (ret & VM_FAULT_LOCKED)
 			unlock_page(vmf.page);
 		put_page(vmf.page);
-		return VM_FAULT_HWPOISON;
+		return ret | VM_FAULT_HWPOISON;
 	}
 
 	if (unlikely(!(ret & VM_FAULT_LOCKED)))
@@ -2916,8 +2981,9 @@ static int pte_alloc_one_map(struct fault_env *fe)
 	if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd))
 		return VM_FAULT_NOPAGE;
 
-	fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address,
-			&fe->ptl);
+	if (!pte_map_lock(fe))
+		return VM_FAULT_RETRY;
+
 	return 0;
 }
@@ -3179,6 +3245,7 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff)
 	 * something).
 	 */
 	if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
+		/* XXX: is a call to pte_map_lock(fe) required here ? */
 		ret = do_fault_around(fe, pgoff);
 		if (ret)
 			return ret;
@@ -3360,8 +3427,8 @@ static int do_numa_page(struct fault_env *fe, pte_t pte)
 	 * page table entry is not accessible, so there would be no
 	 * concurrent hardware modifications to the PTE.
 	 */
-	fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd);
-	spin_lock(fe->ptl);
+	if (!pte_spinlock(fe))
+		return VM_FAULT_RETRY;
 	if (unlikely(!pte_same(*fe->pte, pte))) {
 		pte_unmap_unlock(fe->pte, fe->ptl);
 		goto out;
@@ -3529,8 +3596,8 @@ static int handle_pte_fault(struct fault_env *fe)
 	if (pte_protnone(entry) && vma_is_accessible(fe->vma))
 		return do_numa_page(fe, entry);
 
-	fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd);
-	spin_lock(fe->ptl);
+	if (!pte_spinlock(fe))
+		return VM_FAULT_RETRY;
 	if (unlikely(!pte_same(*fe->pte, entry)))
 		goto unlock;
 	if (fe->flags & FAULT_FLAG_WRITE) {
@@ -3610,6 +3677,94 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 	return handle_pte_fault(&fe);
 }
 
+int handle_speculative_fault(struct mm_struct *mm, unsigned long address, unsigned int flags)
+{
+	struct fault_env fe = {
+		.address = address,
+		.flags = flags | FAULT_FLAG_SPECULATIVE,
+	};
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	int dead, seq, idx, ret = VM_FAULT_RETRY;
+	struct vm_area_struct *vma;
+
+	idx = srcu_read_lock(&vma_srcu);
+	vma = find_vma_srcu(mm, address);
+	if (!vma)
+		goto unlock;
+
+	/*
+	 * Validate the VMA found by the lockless lookup.
+	 */
+	dead = RB_EMPTY_NODE(&vma->vm_rb);
+	seq = raw_read_seqcount(&vma->vm_sequence); /* rmb <-> seqlock,vma_rb_erase() */
+	if ((seq & 1) || dead) /* XXX wait for !&1 instead? */
+		goto unlock;
+
+	if (address < vma->vm_start || vma->vm_end <= address)
+		goto unlock;
+
+	/*
+	 * We need to re-validate the VMA after checking the bounds, otherwise
+	 * we might have a false positive on the bounds.
+	 */
+	if (read_seqcount_retry(&vma->vm_sequence, seq))
+		goto unlock;
+
+	/*
+	 * Do a speculative lookup of the PTE entry.
+	 */
+	local_irq_disable();
+	pgd = pgd_offset(mm, address);
+	if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+		goto out_walk;
+
+	pud = pud_offset(pgd, address);
+	if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+		goto out_walk;
+
+	pmd = pmd_offset(pud, address);
+	if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
+		goto out_walk;
+
+	/*
+	 * The above does not allocate/instantiate page-tables because doing so
+	 * would lead to the possibility of instantiating page-tables after
+	 * free_pgtables() -- and consequently leaking them.
+	 *
+	 * The result is that we take at least one !speculative fault per PMD
+	 * in order to instantiate it.
+	 *
+	 * XXX try and fix that.. should be possible somehow.
+	 */
+
+	if (pmd_huge(*pmd)) /* XXX no huge support */
+		goto out_walk;
+
+	fe.vma = vma;
+	fe.pmd = pmd;
+	fe.sequence = seq;
+
+#if 0
+#warning This is done in handle_pte_fault()...
+	pte = pte_offset_map(pmd, address);
+	fe.entry = ACCESS_ONCE(pte); /* XXX gup_get_pte() */
+	pte_unmap(pte);
+#endif
+	local_irq_enable();
+
+	ret = handle_pte_fault(&fe);
+
+unlock:
+	srcu_read_unlock(&vma_srcu, idx);
+	return ret;
+
+out_walk:
+	local_irq_enable();
+	goto unlock;
+}
+
 /*
  * By the time we get here, we already hold the mm semaphore
 *
diff --git a/mm/mmap.c b/mm/mmap.c
index ca9d91bca0d6c6..fb769f4243d62c 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -152,6 +152,23 @@ void unlink_file_vma(struct vm_area_struct *vma)
 	}
 }
 
+DEFINE_SRCU(vma_srcu);
+
+static void __free_vma(struct rcu_head *head)
+{
+	struct vm_area_struct *vma =
+		container_of(head, struct vm_area_struct, vm_rcu_head);
+
+	if (vma->vm_file)
+		fput(vma->vm_file);
+	kmem_cache_free(vm_area_cachep, vma);
+}
+
+static void free_vma(struct vm_area_struct *vma)
+{
+	call_srcu(&vma_srcu, &vma->vm_rcu_head, __free_vma);
+}
+
 /*
  * Close a vm structure and free it, returning the next.
 */
@@ -162,10 +179,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 	might_sleep();
 	if (vma->vm_ops && vma->vm_ops->close)
 		vma->vm_ops->close(vma);
-	if (vma->vm_file)
-		fput(vma->vm_file);
 	mpol_put(vma_policy(vma));
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma(vma);
 	return next;
 }
 
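free_vma() is what makes it safe for handle_speculative_fault() to dereference a VMA it found without mmap_sem: remove_vma() (and, further down, vma_adjust()) no longer frees the structure or drops the file reference synchronously; __free_vma() runs only after an SRCU grace period, so a reader that entered srcu_read_lock(&vma_srcu) before the VMA was unlinked can keep using the memory, and the vm_sequence / RB_EMPTY_NODE checks only need to detect staleness, not protect the allocation. Below is a single-threaded user-space sketch of the callback shape; the *_model names are invented and the manual "grace period" drain stands in for SRCU's real machinery.

/* Build: cc -std=c11 deferred_free_model.c -o deferred_free_model */
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-in for struct rcu_head / call_srcu(): the callback is simply
 * queued and run later, once all pre-existing readers are done.  In this
 * single-threaded sketch "later" is just a manual drain call. */
struct rcu_head_model {
	struct rcu_head_model *next;
	void (*func)(struct rcu_head_model *head);
};

static struct rcu_head_model *pending;

static void call_srcu_model(struct rcu_head_model *head,
			    void (*func)(struct rcu_head_model *head))
{
	head->func = func;
	head->next = pending;
	pending = head;
}

static void srcu_grace_period_model(void)	/* "all readers finished" */
{
	while (pending) {
		struct rcu_head_model *head = pending;

		pending = head->next;
		head->func(head);
	}
}

/* The VMA side, mirroring the shape of __free_vma()/free_vma(). */
struct vma_model {
	unsigned long vm_start, vm_end;
	struct rcu_head_model vm_rcu_head;
};

static void __free_vma_model(struct rcu_head_model *head)
{
	struct vma_model *vma = (void *)((char *)head -
				offsetof(struct vma_model, vm_rcu_head));

	printf("freeing [%#lx-%#lx] after grace period\n",
	       vma->vm_start, vma->vm_end);
	free(vma);
}

static void free_vma_model(struct vma_model *vma)
{
	call_srcu_model(&vma->vm_rcu_head, __free_vma_model);
}

int main(void)
{
	struct vma_model *vma = malloc(sizeof(*vma));

	vma->vm_start = 0x1000;
	vma->vm_end   = 0x2000;

	free_vma_model(vma);		/* unlink: memory must stay valid...   */
	printf("still readable: %#lx\n", vma->vm_start);
	srcu_grace_period_model();	/* ...until the grace period has ended */
	return 0;
}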
@@ -386,17 +401,19 @@ static void vma_gap_update(struct vm_area_struct *vma)
 	vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
 }
 
-static inline void vma_rb_insert(struct vm_area_struct *vma,
-				 struct rb_root *root)
+static inline void vma_rb_insert(struct vm_area_struct *vma, struct mm_struct *mm)
 {
+	struct rb_root *root = &mm->mm_rb;
+
 	/* All rb_subtree_gap values must be consistent prior to insertion */
 	validate_mm_rb(root, NULL);
 
 	rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
 }
 
-static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
+static void vma_rb_erase(struct vm_area_struct *vma, struct mm_struct *mm)
 {
+	struct rb_root *root = &mm->mm_rb;
 	/*
 	 * All rb_subtree_gap values must be consistent prior to erase,
 	 * with the possible exception of the vma being erased.
@@ -408,7 +425,15 @@ static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
 	 * so make sure we instantiate it only once with our desired
 	 * augmented rbtree callbacks.
 	 */
+	write_seqlock(&mm->mm_seq);
 	rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
+	write_sequnlock(&mm->mm_seq); /* wmb */
+
+	/*
+	 * Ensure the removal is complete before clearing the node.
+	 * Matched by vma_is_dead()/handle_speculative_fault().
+	 */
+	RB_CLEAR_NODE(&vma->vm_rb);
 }
 
 /*
@@ -514,6 +539,8 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
 	else
 		mm->highest_vm_end = vma->vm_end;
 
+	seqcount_init(&vma->vm_sequence);
+
 	/*
 	 * vma->vm_prev wasn't known when we followed the rbtree to find the
 	 * correct insertion point for that vma. As a result, we could not
@@ -523,10 +550,12 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * immediately update the gap to the correct value. Finally we
 	 * rebalance the rbtree after all augmented values have been set.
 	 */
+	write_seqlock(&mm->mm_seq);
 	rb_link_node(&vma->vm_rb, rb_parent, rb_link);
 	vma->rb_subtree_gap = 0;
 	vma_gap_update(vma);
-	vma_rb_insert(vma, &mm->mm_rb);
+	vma_rb_insert(vma, mm);
+	write_sequnlock(&mm->mm_seq);
 }
 
 static void __vma_link_file(struct vm_area_struct *vma)
@@ -600,7 +629,7 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	struct vm_area_struct *next;
 
-	vma_rb_erase(vma, &mm->mm_rb);
+	vma_rb_erase(vma, mm);
 	prev->vm_next = next = vma->vm_next;
 	if (next)
 		next->vm_prev = prev;
@@ -629,6 +658,10 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	long adjust_next = 0;
 	int remove_next = 0;
 
+	write_seqcount_begin(&vma->vm_sequence);
+	if (next)
+		write_seqcount_begin_nested(&next->vm_sequence, SINGLE_DEPTH_NESTING);
+
 	if (next && !insert) {
 		struct vm_area_struct *exporter = NULL, *importer = NULL;
 
@@ -788,21 +821,21 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	}
 
 	if (remove_next) {
-		if (file) {
+		if (file)
 			uprobe_munmap(next, next->vm_start, next->vm_end);
-			fput(file);
-		}
 		if (next->anon_vma)
 			anon_vma_merge(vma, next);
 		mm->map_count--;
 		mpol_put(vma_policy(next));
-		kmem_cache_free(vm_area_cachep, next);
+		free_vma(next);
 		/*
 		 * In mprotect's case 6 (see comments on vma_merge),
 		 * we must remove another next too. It would clutter
 		 * up the code too much to do both in one go.
 		 */
+		write_seqcount_end(&next->vm_sequence);
 		next = vma->vm_next;
+		write_seqcount_begin_nested(&next->vm_sequence, SINGLE_DEPTH_NESTING);
 		if (remove_next == 2) {
 			remove_next = 1;
 			end = next->vm_end;
@@ -816,6 +849,10 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 	if (insert && file)
 		uprobe_mmap(insert);
 
+	if (next)
+		write_seqcount_end(&next->vm_sequence);
+	write_seqcount_end(&vma->vm_sequence);
+
 	validate_mm(mm);
 
 	return 0;
@@ -1937,16 +1974,11 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 EXPORT_SYMBOL(get_unmapped_area);
 
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
-struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+static struct vm_area_struct *__find_vma(struct mm_struct *mm, unsigned long addr)
 {
 	struct rb_node *rb_node;
 	struct vm_area_struct *vma;
 
-	/* Check the cache first. */
-	vma = vmacache_find(mm, addr);
-	if (likely(vma))
-		return vma;
-
 	rb_node = mm->mm_rb.rb_node;
 
 	while (rb_node) {
@@ -1963,13 +1995,40 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
 			rb_node = rb_node->rb_right;
 	}
 
+	return vma;
+}
+
+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
+{
+	struct vm_area_struct *vma;
+
+	/* Check the cache first. */
+	vma = vmacache_find(mm, addr);
+	if (likely(vma))
+		return vma;
+
+	vma = __find_vma(mm, addr);
 	if (vma)
 		vmacache_update(addr, vma);
+
 	return vma;
 }
-
 EXPORT_SYMBOL(find_vma);
 
+struct vm_area_struct *find_vma_srcu(struct mm_struct *mm, unsigned long addr)
+{
+	struct vm_area_struct *vma;
+	unsigned int seq;
+
+	WARN_ON_ONCE(!srcu_read_lock_held(&vma_srcu));
+
+	do {
+		seq = read_seqbegin(&mm->mm_seq);
+		vma = __find_vma(mm, addr);
+	} while (read_seqretry(&mm->mm_seq, seq));
+
+	return vma;
+}
+
 /*
  * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
 */
@@ -2324,7 +2383,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 	insertion_point = (prev ? &prev->vm_next : &mm->mmap);
 	vma->vm_prev = NULL;
 	do {
-		vma_rb_erase(vma, &mm->mm_rb);
+		vma_rb_erase(vma, mm);
 		mm->map_count--;
 		tail_vma = vma;
 		vma = vma->vm_next;
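find_vma_srcu() is the read side of the mm_seq seqlock whose write side appears in __vma_link_rb() and vma_rb_erase() above: every rbtree insertion or erasure bumps mm->mm_seq, and the lockless lookup simply repeats the __find_vma() walk until one complete walk sees the same even sequence value before and after. The sketch below shows only that retry loop; the *_model names are invented, a fake writer forces one retry, and barrier placement (the seqlock's job) is deliberately not modelled.

/* Build: cc -std=c11 find_vma_retry.c -o find_vma_retry */
#include <stdatomic.h>
#include <stdio.h>

static atomic_uint mm_seq_model;	/* bumped twice around every rbtree update */
static int force_one_retry = 1;		/* lets main() fake a concurrent writer   */

static unsigned int read_seqbegin_model(void)
{
	unsigned int seq;

	do {
		seq = atomic_load(&mm_seq_model);
	} while (seq & 1);		/* odd: an update is in flight */
	return seq;
}

static int read_seqretry_model(unsigned int seq)
{
	return atomic_load(&mm_seq_model) != seq;
}

static unsigned long find_vma_model(unsigned long addr)
{
	if (force_one_retry) {		/* pretend vma_rb_erase()+__vma_link_rb() ran */
		force_one_retry = 0;
		atomic_fetch_add(&mm_seq_model, 2);
	}
	return addr & ~0xfffUL;		/* stand-in result of the rbtree walk */
}

int main(void)
{
	unsigned long addr = 0x12345, vma;
	unsigned int seq;
	int walks = 0;

	do {
		seq = read_seqbegin_model();
		vma = find_vma_model(addr);
		walks++;
	} while (read_seqretry_model(seq));

	printf("result %#lx after %d walk(s)\n", vma, walks);
	return 0;
}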