3 changes: 3 additions & 0 deletions include/linux/hugetlb.h
@@ -255,6 +255,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma,

bool is_hugetlb_entry_migration(pte_t pte);
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma);
void hugetlb_split(struct vm_area_struct *vma, unsigned long addr);

#else /* !CONFIG_HUGETLB_PAGE */

@@ -464,6 +465,8 @@ static inline vm_fault_t hugetlb_fault(struct mm_struct *mm,

static inline void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) { }

static inline void hugetlb_split(struct vm_area_struct *vma, unsigned long addr) {}

#endif /* !CONFIG_HUGETLB_PAGE */
/*
* hugepages at page global directory. If arch support
3 changes: 3 additions & 0 deletions include/linux/mm.h
@@ -2539,6 +2539,9 @@ static inline bool pgtable_pmd_page_ctor(struct page *page)
if (!pmd_ptlock_init(page))
return false;
__SetPageTable(page);
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
atomic_set(&page->pt_share_count, 0);
#endif
inc_lruvec_page_state(page, NR_PAGETABLE);
return true;
}
3 changes: 3 additions & 0 deletions include/linux/mm_types.h
@@ -173,6 +173,9 @@ struct page {
union {
struct mm_struct *pt_mm; /* x86 pgds only */
atomic_t pt_frag_refcount; /* powerpc */
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
RH_KABI_BROKEN_INSERT(atomic_t pt_share_count)
#endif
};
#if ALLOC_SPLIT_PTLOCKS
spinlock_t *ptl;
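Taken together, the header changes above add a dedicated pt_share_count field to struct page (under CONFIG_ARCH_WANT_HUGE_PMD_SHARE) and zero it in pgtable_pmd_page_ctor(); the mm/hugetlb.c changes below then use this counter instead of the page refcount to track PMD sharing. The following is a minimal userspace model of that counter's lifecycle as this patch uses it; it is illustrative only, and the struct and function names are hypothetical stand-ins, not kernel APIs.

/*
 * Illustrative userspace model of the pt_share_count semantics used by
 * this patch (not kernel code; names are hypothetical).
 */
#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

struct pmd_table_page {
	atomic_int pt_share_count;	/* mirrors page->pt_share_count */
};

/* pgtable_pmd_page_ctor(): a freshly constructed PMD table starts out unshared. */
static void table_ctor(struct pmd_table_page *p)
{
	atomic_init(&p->pt_share_count, 0);
}

/* huge_pmd_share(): another mapping starts reusing this PMD table. */
static void table_share(struct pmd_table_page *p)
{
	atomic_fetch_add(&p->pt_share_count, 1);
}

/* huge_pmd_unshare(): one sharer drops its PUD entry again. */
static void table_unshare(struct pmd_table_page *p)
{
	atomic_fetch_sub(&p->pt_share_count, 1);
}

/* copy_hugetlb_page_range(): skip copying when the table is shared. */
static bool table_is_shared(struct pmd_table_page *p)
{
	return atomic_load(&p->pt_share_count) != 0;
}

int main(void)
{
	struct pmd_table_page p;

	table_ctor(&p);
	assert(!table_is_shared(&p));
	table_share(&p);	/* a second mapping attaches */
	assert(table_is_shared(&p));
	table_unshare(&p);	/* and detaches again */
	assert(!table_is_shared(&p));
	return 0;
}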
112 changes: 83 additions & 29 deletions mm/hugetlb.c
@@ -94,6 +94,8 @@ static int hugetlb_acct_memory(struct hstate *h, long delta);
static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start, unsigned long end, bool take_locks);

static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
@@ -4834,6 +4836,39 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
return 0;
}

void hugetlb_split(struct vm_area_struct *vma, unsigned long addr)
{
/*
* PMD sharing is only possible for PUD_SIZE-aligned address ranges
* in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this
* split, unshare PMDs in the PUD_SIZE interval surrounding addr now.
* This function is called in the middle of a VMA split operation, with
* MM, VMA and rmap all write-locked to prevent concurrent page table
* walks (except hardware and gup_fast()).
*/
mmap_assert_write_locked(vma->vm_mm);
i_mmap_assert_write_locked(vma->vm_file->f_mapping);

if (addr & ~PUD_MASK) {
unsigned long floor = addr & PUD_MASK;
unsigned long ceil = floor + PUD_SIZE;

if (floor >= vma->vm_start && ceil <= vma->vm_end) {
/*
* Locking:
* Use take_locks=false here.
* The file rmap lock is already held.
* The hugetlb VMA lock can't be taken when we already
* hold the file rmap lock, and we don't need it because
* its purpose is to synchronize against concurrent page
* table walks, which are not possible thanks to the
* locks held by our caller.
*/
hugetlb_unshare_pmds(vma, floor, ceil, /* take_locks = */ false);
}
}
}

static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma)
{
return huge_page_size(hstate_vma(vma));
@@ -4978,18 +5013,13 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
break;
}

/*
* If the pagetables are shared don't copy or take references.
*
* dst_pte == src_pte is the common case of src/dest sharing.
* However, src could have 'unshared' and dst shares with
* another vma. So page_count of ptep page is checked instead
* to reliably determine whether pte is shared.
*/
if (page_count(virt_to_page(dst_pte)) > 1) {
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
/* If the pagetables are shared, there is nothing to do */
if (atomic_read(&virt_to_page(dst_pte)->pt_share_count)) {
addr |= last_addr_mask;
continue;
}
#endif

dst_ptl = huge_pte_lock(h, dst, dst_pte);
src_ptl = huge_pte_lockptr(h, src, src_pte);
@@ -7058,7 +7088,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
spte = hugetlb_walk(svma, saddr,
vma_mmu_pagesize(svma));
if (spte) {
get_page(virt_to_page(spte));
atomic_inc(&virt_to_page(spte)->pt_share_count);
break;
}
}
@@ -7073,7 +7103,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
(pmd_t *)((unsigned long)spte & PAGE_MASK));
mm_inc_nr_pmds(mm);
} else {
put_page(virt_to_page(spte));
atomic_dec(&virt_to_page(spte)->pt_share_count);
}
spin_unlock(ptl);
out:
@@ -7085,10 +7115,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* unmap huge page backed by shared pte.
*
* Hugetlb pte page is ref counted at the time of mapping. If pte is shared
* indicated by page_count > 1, unmap is achieved by clearing pud and
* decrementing the ref count. If count == 1, the pte page is not shared.
*
* Called with page table lock held.
*
* returns: 1 successfully unmapped a shared pte page
@@ -7097,18 +7123,27 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pte_t *ptep)
{
unsigned long sz = huge_page_size(hstate_vma(vma));
pgd_t *pgd = pgd_offset(mm, addr);
p4d_t *p4d = p4d_offset(pgd, addr);
pud_t *pud = pud_offset(p4d, addr);

i_mmap_assert_write_locked(vma->vm_file->f_mapping);
hugetlb_vma_assert_locked(vma);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
if (page_count(virt_to_page(ptep)) == 1)
if (sz != PMD_SIZE)
return 0;
if (!atomic_read(&virt_to_page(ptep)->pt_share_count))
return 0;

pud_clear(pud);
put_page(virt_to_page(ptep));
/*
* Once our caller drops the rmap lock, some other process might be
* using this page table as a normal, non-hugetlb page table.
* Wait for pending gup_fast() in other threads to finish before letting
* that happen.
*/
tlb_remove_table_sync_one();
atomic_dec(&virt_to_page(ptep)->pt_share_count);
mm_dec_nr_pmds(mm);
return 1;
}
@@ -7340,25 +7375,27 @@ void move_hugetlb_state(struct folio *old_folio, struct folio *new_folio, int re
}

/*
* This function will unconditionally remove all the shared pmd pgtable entries
* within the specific vma for a hugetlbfs memory range.
* If @take_locks is false, the caller must ensure that no concurrent page table
* access can happen (except for gup_fast() and hardware page walks).
* If @take_locks is true, we take the hugetlb VMA lock (to lock out things like
* concurrent page fault handling) and the file rmap lock.
*/
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start,
unsigned long end,
bool take_locks)
{
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
struct mm_struct *mm = vma->vm_mm;
struct mmu_notifier_range range;
unsigned long address, start, end;
unsigned long address;
spinlock_t *ptl;
pte_t *ptep;

if (!(vma->vm_flags & VM_MAYSHARE))
return;

start = ALIGN(vma->vm_start, PUD_SIZE);
end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);

if (start >= end)
return;

@@ -7370,8 +7407,12 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
start, end);
mmu_notifier_invalidate_range_start(&range);
hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
if (take_locks) {
hugetlb_vma_lock_write(vma);
i_mmap_lock_write(vma->vm_file->f_mapping);
} else {
i_mmap_assert_write_locked(vma->vm_file->f_mapping);
}
for (address = start; address < end; address += PUD_SIZE) {
ptep = hugetlb_walk(vma, address, sz);
if (!ptep)
@@ -7381,15 +7422,28 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
spin_unlock(ptl);
}
flush_hugetlb_tlb_range(vma, start, end);
i_mmap_unlock_write(vma->vm_file->f_mapping);
hugetlb_vma_unlock_write(vma);
if (take_locks) {
i_mmap_unlock_write(vma->vm_file->f_mapping);
hugetlb_vma_unlock_write(vma);
}
/*
* No need to call mmu_notifier_arch_invalidate_secondary_tlbs(), see
* Documentation/mm/mmu_notifier.rst.
*/
mmu_notifier_invalidate_range_end(&range);
}

/*
* This function will unconditionally remove all the shared pmd pgtable entries
* within the specific vma for a hugetlbfs memory range.
*/
void hugetlb_unshare_all_pmds(struct vm_area_struct *vma)
{
hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE),
ALIGN_DOWN(vma->vm_end, PUD_SIZE),
/* take_locks = */ true);
}

#ifdef CONFIG_CMA
static bool cma_reserve_called __initdata;

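Before moving on to the call site in mm/mmap.c, here is a worked example of the alignment check that hugetlb_split() performs above: unsharing is only needed when the split address breaks PUD_SIZE alignment and the surrounding PUD-sized interval lies entirely inside the VMA. The sketch assumes a 1 GiB PUD_SIZE (as on x86-64) and a 64-bit build; the constants and helper are illustrative userspace code, not part of the patch.

/*
 * Worked example of the alignment check in hugetlb_split(), assuming
 * PUD_SIZE = 1 GiB (x86-64, 64-bit unsigned long). Purely illustrative.
 */
#include <stdbool.h>
#include <stdio.h>

#define PUD_SIZE (1UL << 30)
#define PUD_MASK (~(PUD_SIZE - 1))

/* Returns true if a split at addr would require unsharing PMDs. */
static bool split_needs_unshare(unsigned long vm_start, unsigned long vm_end,
				unsigned long addr)
{
	unsigned long floor, ceil;

	if (!(addr & ~PUD_MASK))	/* addr still PUD-aligned: nothing to do */
		return false;
	floor = addr & PUD_MASK;
	ceil = floor + PUD_SIZE;
	/* Only unshare if the whole PUD interval is inside the VMA. */
	return floor >= vm_start && ceil <= vm_end;
}

int main(void)
{
	unsigned long start = 2UL << 30, end = 6UL << 30;	/* VMA [2 GiB, 6 GiB) */

	/* Split at 4 GiB: still PUD-aligned, sharing can stay; prints 0. */
	printf("%d\n", split_needs_unshare(start, end, 4UL << 30));
	/* Split at 4 GiB + 2 MiB: breaks alignment, unshare [4 GiB, 5 GiB); prints 1. */
	printf("%d\n", split_needs_unshare(start, end, (4UL << 30) + (2UL << 20)));
	return 0;
}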
8 changes: 8 additions & 0 deletions mm/mmap.c
@@ -815,7 +815,15 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
}
}
again:
/*
* Get rid of huge pages and shared page tables straddling the split
* boundary.
*/
vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
if (is_vm_hugetlb_page(orig_vma)) {
hugetlb_split(orig_vma, start);
hugetlb_split(orig_vma, end);
}

if (file) {
mapping = file->f_mapping;