
Commit aa39ca6

davidhildenbrand authored and akpm00 committed
mm/pagewalk: introduce folio_walk_start() + folio_walk_end()
We want to get rid of follow_page(), and have a more reasonable way to just lookup a folio mapped at a certain address, perform some checks while still under PTL, and then only conditionally grab a folio reference if really required.

Further, we might want to get rid of some walk_page_range*() users that really only want to temporarily lookup a single folio at a single address.

So let's add a new page table walker that does exactly that, similarly to GUP also being able to walk hugetlb VMAs.

Add folio_walk_end() as a macro for now: the compiler is not easy to please with the pte_unmap()->kunmap_local().

Note that one difference between follow_page() and get_user_pages(1) is that follow_page() will not trigger faults to get something mapped. So folio_walk is at least currently not a replacement for get_user_pages(1), but could likely be extended/reused to achieve something similar in the future.

Link: https://lkml.kernel.org/r/20240802155524.517137-3-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Claudio Imbrenda <imbrenda@linux.ibm.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Janosch Frank <frankja@linux.ibm.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Cc: Zi Yan <ziy@nvidia.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
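To make the intended usage pattern concrete, here is a minimal sketch: walk to the folio, perform checks while still under the PTL, and only conditionally take a short-term reference. The caller function below is hypothetical; only folio_walk_start()/folio_walk_end() come from this commit, and folio_try_get() is an existing helper. It assumes the mmap lock is held in read mode.

        /* Hypothetical caller; not part of this commit. */
        static struct folio *lookup_folio_at(struct vm_area_struct *vma,
                        unsigned long addr)
        {
                struct folio_walk fw;
                struct folio *folio;

                folio = folio_walk_start(&fw, vma, addr, 0);
                if (!folio)
                        return NULL;    /* nothing (suitable) mapped */

                /* Checks happen here, while the PTL is still held. */
                if (!folio_try_get(folio))
                        folio = NULL;

                /*
                 * Drops the PTL; the folio may only be used afterwards
                 * because we took a reference above.
                 */
                folio_walk_end(&fw, vma);
                return folio;
        }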
1 parent 3523a37 commit aa39ca6

2 files changed, 260 insertions(+), 0 deletions(-)

include/linux/pagewalk.h (+58 lines)
@@ -130,4 +130,62 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
 		pgoff_t nr, const struct mm_walk_ops *ops,
 		void *private);
 
+typedef int __bitwise folio_walk_flags_t;
+
+/*
+ * Walk migration entries as well. Careful: a large folio might get split
+ * concurrently.
+ */
+#define FW_MIGRATION		((__force folio_walk_flags_t)BIT(0))
+
+/* Walk shared zeropages (small + huge) as well. */
+#define FW_ZEROPAGE		((__force folio_walk_flags_t)BIT(1))
+
+enum folio_walk_level {
+	FW_LEVEL_PTE,
+	FW_LEVEL_PMD,
+	FW_LEVEL_PUD,
+};
+
+/**
+ * struct folio_walk - folio_walk_start() / folio_walk_end() data
+ * @page: exact folio page referenced (if applicable)
+ * @level: page table level identifying the entry type
+ * @pte: pointer to the page table entry (FW_LEVEL_PTE).
+ * @pmd: pointer to the page table entry (FW_LEVEL_PMD).
+ * @pud: pointer to the page table entry (FW_LEVEL_PUD).
+ * @ptl: pointer to the page table lock.
+ *
+ * (see folio_walk_start() documentation for more details)
+ */
+struct folio_walk {
+	/* public */
+	struct page *page;
+	enum folio_walk_level level;
+	union {
+		pte_t *ptep;
+		pud_t *pudp;
+		pmd_t *pmdp;
+	};
+	union {
+		pte_t pte;
+		pud_t pud;
+		pmd_t pmd;
+	};
+	/* private */
+	struct vm_area_struct *vma;
+	spinlock_t *ptl;
+};
+
+struct folio *folio_walk_start(struct folio_walk *fw,
+		struct vm_area_struct *vma, unsigned long addr,
+		folio_walk_flags_t flags);
+
+#define folio_walk_end(__fw, __vma) do { \
+	spin_unlock((__fw)->ptl); \
+	if (likely((__fw)->level == FW_LEVEL_PTE)) \
+		pte_unmap((__fw)->ptep); \
+	vma_pgtable_walk_end(__vma); \
+} while (0)
+
 #endif /* _LINUX_PAGEWALK_H */
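Because struct folio_walk exposes the entry through level-dependent unions, a caller has to dispatch on @level before touching @pte/@pmd/@pud. A minimal sketch, assuming the existing pte_write()/pmd_write()/pud_write() helpers; the function itself is hypothetical and may only run between folio_walk_start() and folio_walk_end(), while the PTL is still held:

        /*
         * Hypothetical helper; reads the entry value cached in @fw,
         * not the page table itself.
         */
        static bool fw_entry_writable(const struct folio_walk *fw)
        {
                switch (fw->level) {
                case FW_LEVEL_PTE:
                        return pte_write(fw->pte);
                case FW_LEVEL_PMD:
                        return pmd_write(fw->pmd);
                case FW_LEVEL_PUD:
                        return pud_write(fw->pud);
                }
                return false;
        }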

mm/pagewalk.c (+202 lines)
@@ -3,6 +3,8 @@
 #include <linux/highmem.h>
 #include <linux/sched.h>
 #include <linux/hugetlb.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 
 /*
  * We want to know the real level where a entry is located ignoring any
@@ -654,3 +656,203 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
 
 	return err;
 }
+
+/**
+ * folio_walk_start - walk the page tables to a folio
+ * @fw: filled with information on success.
+ * @vma: the VMA.
+ * @addr: the virtual address to use for the page table walk.
+ * @flags: flags modifying which folios to walk to.
+ *
+ * Walk the page tables using @addr in a given @vma to a mapped folio and
+ * return the folio, making sure that the page table entry referenced by
+ * @addr cannot change until folio_walk_end() was called.
+ *
+ * As default, this function returns only folios that are not special (e.g., not
+ * the zeropage) and never returns folios that are supposed to be ignored by the
+ * VM as documented by vm_normal_page(). If requested, zeropages will be
+ * returned as well.
+ *
+ * As default, this function only considers present page table entries.
+ * If requested, it will also consider migration entries.
+ *
+ * If this function returns NULL it might either indicate "there is nothing" or
+ * "there is nothing suitable".
+ *
+ * On success, @fw is filled and the function returns the folio while the PTL
+ * is still held and folio_walk_end() must be called to clean up,
+ * releasing any held locks. The returned folio must *not* be used after the
+ * call to folio_walk_end(), unless a short-term folio reference is taken before
+ * that call.
+ *
+ * @fw->page will correspond to the page that is effectively referenced by
+ * @addr. However, for migration entries and shared zeropages @fw->page is
+ * set to NULL. Note that large folios might be mapped by multiple page table
+ * entries, and this function will always only lookup a single entry as
+ * specified by @addr, which might or might not cover more than a single page of
+ * the returned folio.
+ *
+ * This function must *not* be used as a naive replacement for
+ * get_user_pages() / pin_user_pages(), especially not to perform DMA or
+ * to carelessly modify page content. This function may *only* be used to grab
+ * short-term folio references, never to grab long-term folio references.
+ *
+ * Using the page table entry pointers in @fw for reading or modifying the
+ * entry should be avoided where possible: however, there might be valid
+ * use cases.
+ *
+ * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of care.
+ * For example, PMD page table sharing might require prior unsharing. Also,
+ * logical hugetlb entries might span multiple physical page table entries,
+ * which *must* be modified in a single operation (set_huge_pte_at(),
+ * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might
+ * not correspond to the first physical entry of a logical hugetlb entry.
+ *
+ * The mmap lock must be held in read mode.
+ *
+ * Return: folio pointer on success, otherwise NULL.
+ */
+struct folio *folio_walk_start(struct folio_walk *fw,
+		struct vm_area_struct *vma, unsigned long addr,
+		folio_walk_flags_t flags)
+{
+	unsigned long entry_size;
+	bool expose_page = true;
+	struct page *page;
+	pud_t *pudp, pud;
+	pmd_t *pmdp, pmd;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+	pgd_t *pgdp;
+	p4d_t *p4dp;
+
+	mmap_assert_locked(vma->vm_mm);
+	vma_pgtable_walk_begin(vma);
+
+	if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end))
+		goto not_found;
+
+	pgdp = pgd_offset(vma->vm_mm, addr);
+	if (pgd_none_or_clear_bad(pgdp))
+		goto not_found;
+
+	p4dp = p4d_offset(pgdp, addr);
+	if (p4d_none_or_clear_bad(p4dp))
+		goto not_found;
+
+	pudp = pud_offset(p4dp, addr);
+	pud = pudp_get(pudp);
+	if (pud_none(pud))
+		goto not_found;
+	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) {
+		ptl = pud_lock(vma->vm_mm, pudp);
+		pud = pudp_get(pudp);
+
+		entry_size = PUD_SIZE;
+		fw->level = FW_LEVEL_PUD;
+		fw->pudp = pudp;
+		fw->pud = pud;
+
+		if (!pud_present(pud) || pud_devmap(pud)) {
+			spin_unlock(ptl);
+			goto not_found;
+		} else if (!pud_leaf(pud)) {
+			spin_unlock(ptl);
+			goto pmd_table;
+		}
+		/*
+		 * TODO: vm_normal_page_pud() will be handy once we want to
+		 * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs.
+		 */
+		page = pud_page(pud);
+		goto found;
+	}
+
+pmd_table:
+	VM_WARN_ON_ONCE(pud_leaf(*pudp));
+	pmdp = pmd_offset(pudp, addr);
+	pmd = pmdp_get_lockless(pmdp);
+	if (pmd_none(pmd))
+		goto not_found;
+	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) {
+		ptl = pmd_lock(vma->vm_mm, pmdp);
+		pmd = pmdp_get(pmdp);
+
+		entry_size = PMD_SIZE;
+		fw->level = FW_LEVEL_PMD;
+		fw->pmdp = pmdp;
+		fw->pmd = pmd;
+
+		if (pmd_none(pmd)) {
+			spin_unlock(ptl);
+			goto not_found;
+		} else if (!pmd_leaf(pmd)) {
+			spin_unlock(ptl);
+			goto pte_table;
+		} else if (pmd_present(pmd)) {
+			page = vm_normal_page_pmd(vma, addr, pmd);
+			if (page) {
+				goto found;
+			} else if ((flags & FW_ZEROPAGE) &&
+				    is_huge_zero_pmd(pmd)) {
+				page = pfn_to_page(pmd_pfn(pmd));
+				expose_page = false;
+				goto found;
+			}
+		} else if ((flags & FW_MIGRATION) &&
+			   is_pmd_migration_entry(pmd)) {
+			swp_entry_t entry = pmd_to_swp_entry(pmd);
+
+			page = pfn_swap_entry_to_page(entry);
+			expose_page = false;
+			goto found;
+		}
+		spin_unlock(ptl);
+		goto not_found;
+	}
+
+pte_table:
+	VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp)));
+	ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
+	if (!ptep)
+		goto not_found;
+	pte = ptep_get(ptep);
+
+	entry_size = PAGE_SIZE;
+	fw->level = FW_LEVEL_PTE;
+	fw->ptep = ptep;
+	fw->pte = pte;
+
+	if (pte_present(pte)) {
+		page = vm_normal_page(vma, addr, pte);
+		if (page)
+			goto found;
+		if ((flags & FW_ZEROPAGE) &&
+		    is_zero_pfn(pte_pfn(pte))) {
+			page = pfn_to_page(pte_pfn(pte));
+			expose_page = false;
+			goto found;
+		}
+	} else if (!pte_none(pte)) {
+		swp_entry_t entry = pte_to_swp_entry(pte);
+
+		if ((flags & FW_MIGRATION) &&
+		    is_migration_entry(entry)) {
+			page = pfn_swap_entry_to_page(entry);
+			expose_page = false;
+			goto found;
+		}
+	}
+	pte_unmap_unlock(ptep, ptl);
+not_found:
+	vma_pgtable_walk_end(vma);
+	return NULL;
+found:
+	if (expose_page)
+		/* Note: Offset from the mapped page, not the folio start. */
+		fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT);
+	else
+		fw->page = NULL;
+	fw->ptl = ptl;
+	return page_folio(page);
+}
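To illustrate the @fw->page computation at the found label with hypothetical numbers: for a 2 MiB THP mapped by a PMD leaf on a configuration with 4 KiB pages, entry_size is PMD_SIZE (0x200000). If addr lies 0x3000 bytes into that mapping, (addr & (entry_size - 1)) >> PAGE_SHIFT yields 0x3000 >> 12 == 3, so fw->page is the fourth page of the mapping rather than the folio's first page, exactly as the "offset from the mapped page, not the folio start" comment warns.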
