 #include <linux/highmem.h>
 #include <linux/sched.h>
 #include <linux/hugetlb.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 
 /*
  * We want to know the real level where an entry is located ignoring any
@@ -654,3 +656,205 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
 
 	return err;
 }
+
+/**
+ * folio_walk_start - walk the page tables to a folio
+ * @fw: filled with information on success.
+ * @vma: the VMA.
+ * @addr: the virtual address to use for the page table walk.
+ * @flags: flags modifying which folios to walk to.
+ *
+ * Walk the page tables using @addr in a given @vma to a mapped folio and
+ * return the folio, making sure that the page table entry referenced by
+ * @addr cannot change until folio_walk_end() has been called.
+ *
+ * By default, this function returns only folios that are not special (e.g.,
+ * not the zeropage) and never returns folios that are supposed to be ignored
+ * by the VM as documented by vm_normal_page(). If requested, zeropages will
+ * be returned as well.
+ *
+ * By default, this function only considers present page table entries.
+ * If requested, it will also consider migration entries.
+ *
+ * If this function returns NULL, it might either indicate "there is nothing"
+ * or "there is nothing suitable".
+ *
+ * On success, @fw is filled and the function returns the folio while the PTL
+ * is still held; folio_walk_end() must be called to clean up and release any
+ * held locks. The returned folio must *not* be used after the call to
+ * folio_walk_end(), unless a short-term folio reference is taken before
+ * that call.
+ *
+ * @fw->page will correspond to the page that is effectively referenced by
+ * @addr. However, for migration entries and shared zeropages, @fw->page is
+ * set to NULL. Note that large folios might be mapped by multiple page table
+ * entries, and this function will only ever look up a single entry as
+ * specified by @addr, which might or might not cover more than a single page
+ * of the returned folio.
+ *
+ * This function must *not* be used as a naive replacement for
+ * get_user_pages() / pin_user_pages(), especially not to perform DMA or
+ * to carelessly modify page content. This function may *only* be used to grab
+ * short-term folio references, never to grab long-term folio references.
+ *
+ * Using the page table entry pointers in @fw for reading or modifying the
+ * entry should be avoided where possible; however, there might be valid
+ * use cases.
+ *
+ * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of
+ * care. For example, PMD page table sharing might require prior unsharing.
+ * Also, logical hugetlb entries might span multiple physical page table
+ * entries, which *must* be modified in a single operation (set_huge_pte_at(),
+ * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might
+ * not correspond to the first physical entry of a logical hugetlb entry.
+ *
+ * The mmap lock must be held in read mode.
+ *
+ * Return: folio pointer on success, otherwise NULL.
+ */
+struct folio *folio_walk_start(struct folio_walk *fw,
+		struct vm_area_struct *vma, unsigned long addr,
+		folio_walk_flags_t flags)
+{
+	unsigned long entry_size;
+	bool expose_page = true;
+	struct page *page;
+	pud_t *pudp, pud;
+	pmd_t *pmdp, pmd;
+	pte_t *ptep, pte;
+	spinlock_t *ptl;
+	pgd_t *pgdp;
+	p4d_t *p4dp;
+
+	mmap_assert_locked(vma->vm_mm);
+	vma_pgtable_walk_begin(vma);
+
+	if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end))
+		goto not_found;
+
+	pgdp = pgd_offset(vma->vm_mm, addr);
+	if (pgd_none_or_clear_bad(pgdp))
+		goto not_found;
+
+	p4dp = p4d_offset(pgdp, addr);
+	if (p4d_none_or_clear_bad(p4dp))
+		goto not_found;
+
+	pudp = pud_offset(p4dp, addr);
+	pud = pudp_get(pudp);
+	if (pud_none(pud))
+		goto not_found;
+	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) {
+		ptl = pud_lock(vma->vm_mm, pudp);
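+		/* Re-read the entry under the PTL: it can no longer change. */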
+		pud = pudp_get(pudp);
+
+		entry_size = PUD_SIZE;
+		fw->level = FW_LEVEL_PUD;
+		fw->pudp = pudp;
+		fw->pud = pud;
+
+		if (!pud_present(pud) || pud_devmap(pud)) {
+			spin_unlock(ptl);
+			goto not_found;
+		} else if (!pud_leaf(pud)) {
+			spin_unlock(ptl);
+			goto pmd_table;
+		}
+		/*
+		 * TODO: vm_normal_page_pud() will be handy once we want to
+		 * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs.
+		 */
+		page = pud_page(pud);
+		goto found;
+	}
+
+pmd_table:
+	VM_WARN_ON_ONCE(pud_leaf(*pudp));
+	pmdp = pmd_offset(pudp, addr);
+	pmd = pmdp_get_lockless(pmdp);
+	if (pmd_none(pmd))
+		goto not_found;
+	if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) {
+		ptl = pmd_lock(vma->vm_mm, pmdp);
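+		/* With the PTL held, re-read the entry so it is stable. */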
+		pmd = pmdp_get(pmdp);
+
+		entry_size = PMD_SIZE;
+		fw->level = FW_LEVEL_PMD;
+		fw->pmdp = pmdp;
+		fw->pmd = pmd;
+
+		if (pmd_none(pmd)) {
+			spin_unlock(ptl);
+			goto not_found;
+		} else if (!pmd_leaf(pmd)) {
+			spin_unlock(ptl);
+			goto pte_table;
+		} else if (pmd_present(pmd)) {
+			page = vm_normal_page_pmd(vma, addr, pmd);
+			if (page) {
+				goto found;
+			} else if ((flags & FW_ZEROPAGE) &&
+				    is_huge_zero_pmd(pmd)) {
+				page = pfn_to_page(pmd_pfn(pmd));
+				expose_page = false;
+				goto found;
+			}
+		} else if ((flags & FW_MIGRATION) &&
+			   is_pmd_migration_entry(pmd)) {
+			swp_entry_t entry = pmd_to_swp_entry(pmd);
+
+			page = pfn_swap_entry_to_page(entry);
+			expose_page = false;
+			goto found;
+		}
+		spin_unlock(ptl);
+		goto not_found;
+	}
+
+pte_table:
+	VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp)));
+	ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
+	if (!ptep)
+		goto not_found;
+	pte = ptep_get(ptep);
+
+	entry_size = PAGE_SIZE;
+	fw->level = FW_LEVEL_PTE;
+	fw->ptep = ptep;
+	fw->pte = pte;
+
+	if (pte_present(pte)) {
+		page = vm_normal_page(vma, addr, pte);
+		if (page)
+			goto found;
+		if ((flags & FW_ZEROPAGE) &&
+		    is_zero_pfn(pte_pfn(pte))) {
+			page = pfn_to_page(pte_pfn(pte));
+			expose_page = false;
+			goto found;
+		}
+	} else if (!pte_none(pte)) {
+		swp_entry_t entry = pte_to_swp_entry(pte);
+
+		if ((flags & FW_MIGRATION) &&
+		    is_migration_entry(entry)) {
+			page = pfn_swap_entry_to_page(entry);
+			expose_page = false;
+			goto found;
+		}
+	}
+	pte_unmap_unlock(ptep, ptl);
+not_found:
+	vma_pgtable_walk_end(vma);
+	return NULL;
+found:
+	if (expose_page)
+		/* Note: Offset from the mapped page, not the folio start. */
+		fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT);
+	else
+		fw->page = NULL;
+	fw->ptl = ptl;
+	return page_folio(page);
+}
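
A minimal usage sketch of the new API, for illustration only: walk to the folio mapped at addr, take a short-term folio reference while the PTL is still held, and only then end the walk. The helper name grab_folio_at() is hypothetical; struct folio_walk, folio_walk_flags_t, and folio_walk_end() are assumed to be declared in the <linux/pagewalk.h> part of this change, which is not shown in this hunk.

	/* Hypothetical caller: return the folio mapped at @addr with a reference held, or NULL. */
	static struct folio *grab_folio_at(struct vm_area_struct *vma,
					   unsigned long addr)
	{
		struct folio_walk fw;
		struct folio *folio;

		/* The mmap lock must be held in read mode, see above. */
		mmap_assert_locked(vma->vm_mm);

		/* Flags 0: present entries only, no zeropages. */
		folio = folio_walk_start(&fw, vma, addr, 0);
		if (!folio)
			return NULL;	/* Nothing there, or nothing suitable. */

		/*
		 * The PTL is still held: take a short-term reference before
		 * folio_walk_end(), as the folio must not be used afterwards.
		 */
		folio_get(folio);
		folio_walk_end(&fw, vma);
		return folio;
	}

Passing FW_ZEROPAGE or FW_MIGRATION instead of 0 would additionally walk to zeropages or migration entries, in which case fw.page is NULL as documented above.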