From efdbaef3e11a8a82f741baddad7d47f8606798aa Mon Sep 17 00:00:00 2001
From: Andrea Reale
Date: Fri, 28 Apr 2017 14:18:26 +0530
Subject: [PATCH] arm64: Hot-remove implementation for arm64

- arch_remove_memory interface
- kernel page tables cleanup
- vmemmap_free implementation for arm64

Change-Id: I8945b6b91ed7012abc1478de266302427ebeb639
Signed-off-by: Andrea Reale
Signed-off-by: Maciej Bielski
Patch-mainline: linux-kernel @ 11 Apr 2017, 18:25
Signed-off-by: Srivatsa Vaddagiri
Signed-off-by: Arun KS
---
 arch/arm64/Kconfig               |   3 +
 arch/arm64/include/asm/mmu.h     |   4 +
 arch/arm64/include/asm/pgtable.h |  15 ++
 arch/arm64/mm/init.c             |  32 +++-
 arch/arm64/mm/mmu.c              | 383 ++++++++++++++++++++++++++++++++++++++-
 5 files changed, 431 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 104588d55777..3cb501b93da6 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -656,6 +656,9 @@ config ARCH_ENABLE_MEMORY_HOTPLUG
 	depends on !NUMA
 	def_bool y
 
+config ARCH_ENABLE_MEMORY_HOTREMOVE
+	def_bool y
+
 # The GPIO number here must be sorted by descending number. In case of
 # a multiplatform kernel, we just want the highest value required by the
 # selected platforms.
diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h
index 0944bfc04f5b..7ad6b91d12c5 100644
--- a/arch/arm64/include/asm/mmu.h
+++ b/arch/arm64/include/asm/mmu.h
@@ -37,6 +37,10 @@ extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
 extern void *fixmap_remap_fdt(phys_addr_t dt_phys);
 #ifdef CONFIG_MEMORY_HOTPLUG
 extern void hotplug_paging(phys_addr_t start, phys_addr_t size);
+#ifdef CONFIG_MEMORY_HOTREMOVE
+extern void remove_pagetable(unsigned long start,
+	unsigned long end, bool direct);
+#endif
 #endif
 
 #endif
diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 72b1c3fa1576..ecd7dc14330c 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -461,6 +461,11 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd)
 	return pmd_val(pmd) & PHYS_MASK & (s32)PAGE_MASK;
 }
 
+static inline unsigned long pmd_page_vaddr(pmd_t pmd)
+{
+	return (unsigned long) __va(pmd_page_paddr(pmd));
+}
+
 /* Find an entry in the third-level page table. */
 #define pte_index(addr)		(((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1))
 
@@ -512,6 +517,11 @@ static inline phys_addr_t pud_page_paddr(pud_t pud)
 	return pud_val(pud) & PHYS_MASK & (s32)PAGE_MASK;
 }
 
+static inline unsigned long pud_page_vaddr(pud_t pud)
+{
+	return (unsigned long) __va(pud_page_paddr(pud));
+}
+
 /* Find an entry in the second-level page table. */
 #define pmd_index(addr)		(((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1))
 
@@ -564,6 +574,11 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd)
 	return pgd_val(pgd) & PHYS_MASK & (s32)PAGE_MASK;
 }
 
+static inline unsigned long pgd_page_vaddr(pgd_t pgd)
+{
+	return (unsigned long) __va(pgd_page_paddr(pgd));
+}
+
 /* Find an entry in the frst-level page table. */
 #define pud_index(addr)		(((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1))
 
diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
index bc45677d9ce6..75d363be3f36 100644
--- a/arch/arm64/mm/init.c
+++ b/arch/arm64/mm/init.c
@@ -505,7 +505,6 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	unsigned long end_pfn = start_pfn + nr_pages;
 	unsigned long max_sparsemem_pfn = 1UL << (MAX_PHYSMEM_BITS-PAGE_SHIFT);
-	unsigned long pfn;
 	int ret;
 
 	if (end_pfn > max_sparsemem_pfn) {
@@ -578,5 +577,34 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device)
 
 	return ret;
 }
 
-#endif
+#ifdef CONFIG_MEMORY_HOTREMOVE
+static void kernel_physical_mapping_remove(unsigned long start,
+	unsigned long end)
+{
+	start = (unsigned long)__va(start);
+	end = (unsigned long)__va(end);
+
+	remove_pagetable(start, end, true);
+
+}
+
+int arch_remove_memory(u64 start, u64 size)
+{
+	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long nr_pages = size >> PAGE_SHIFT;
+	struct page *page = pfn_to_page(start_pfn);
+	struct zone *zone;
+	int ret = 0;
+
+	zone = page_zone(page);
+	ret = __remove_pages(zone, start_pfn, nr_pages);
+	WARN_ON_ONCE(ret);
+
+	kernel_physical_mapping_remove(start, start + size);
+
+	return ret;
+}
+
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index cdb9338568f4..2b48e29ec30c 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1,4 +1,3 @@
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 /*
  * Based on arch/arm/mm/mmu.c
  *
@@ -131,7 +130,6 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr,
 		phys_addr_t pte_phys;
 		BUG_ON(!pgtable_alloc);
 		pte_phys = pgtable_alloc();
-		pr_debug("Allocating PTE at %pK\n", __va(pte_phys));
 		pte = pte_set_fixmap(pte_phys);
 		if (pmd_sect(*pmd))
 			split_pmd(pmd, pte);
@@ -196,7 +194,6 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end,
 		phys_addr_t pmd_phys;
 		BUG_ON(!pgtable_alloc);
 		pmd_phys = pgtable_alloc();
-		pr_debug("Allocating PMD at %pK\n", __va(pmd_phys));
 		pmd = pmd_set_fixmap(pmd_phys);
 		if (pud_sect(*pud)) {
 			/*
@@ -265,7 +262,6 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end,
 		phys_addr_t pud_phys;
 		BUG_ON(!pgtable_alloc);
 		pud_phys = pgtable_alloc();
-		pr_debug("Allocating PUD at %pK\n", __va(pud_phys));
 		__pgd_populate(pgd, pud_phys, PUD_TYPE_TABLE);
 	}
 	BUG_ON(pgd_bad(*pgd));
@@ -648,7 +644,383 @@ void hotplug_paging(phys_addr_t start, phys_addr_t size)
 	__free_pages(pg, 0);
 }
 
+#ifdef CONFIG_MEMORY_HOTREMOVE
+#define PAGE_INUSE 0xFD
+
+static void free_pagetable(struct page *page, int order, bool direct)
+{
+	unsigned long magic;
+	unsigned int nr_pages = 1 << order;
+
+	/* bootmem page has reserved flag */
+	if (PageReserved(page)) {
+		__ClearPageReserved(page);
+
+		magic = (unsigned long)page->lru.next;
+		if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) {
+			while (nr_pages--)
+				put_page_bootmem(page++);
+		} else {
+			while (nr_pages--)
+				free_reserved_page(page++);
+		}
+	} else {
+		/*
+		 * Only direct pagetable allocations (those allocated via
+		 * hotplug) call the pgtable_page_ctor; vmemmap pgtable
+		 * allocations don't.
+		 */
+		if (direct)
+			pgtable_page_dtor(page);
+
+		free_pages((unsigned long)page_address(page), order);
+	}
+}
+
+static void free_pte_table(pmd_t *pmd, bool direct)
+{
+	pte_t *pte_start, *pte;
+	struct page *page;
+	int i;
+
+	pte_start = (pte_t *) pmd_page_vaddr(*pmd);
+	/* Check if there is no valid entry in the PTE table */
+	for (i = 0; i < PTRS_PER_PTE; i++) {
+		pte = pte_start + i;
+		if (!pte_none(*pte))
+			return;
+	}
+
+	page = pmd_page(*pmd);
+
+	free_pagetable(page, 0, direct);
+
+	/*
+	 * This spin lock could only be taken in __pte_alloc_kernel
+	 * in mm/memory.c and nowhere else (for arm64). Not sure if
+	 * the function above can be called concurrently. In doubt,
+	 * I am leaving it here for now, but it probably can be removed.
+	 */
+	spin_lock(&init_mm.page_table_lock);
+	pmd_clear(pmd);
+	spin_unlock(&init_mm.page_table_lock);
+}
+
+static void free_pmd_table(pud_t *pud, bool direct)
+{
+	pmd_t *pmd_start, *pmd;
+	struct page *page;
+	int i;
+
+	pmd_start = (pmd_t *) pud_page_vaddr(*pud);
+	/* Check if there is no valid entry in the PMD table */
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd = pmd_start + i;
+		if (!pmd_none(*pmd))
+			return;
+	}
+
+	page = pud_page(*pud);
+
+	free_pagetable(page, 0, direct);
+
+	/*
+	 * This spin lock could only be taken in __pte_alloc_kernel
+	 * in mm/memory.c and nowhere else (for arm64). Not sure if
+	 * the function above can be called concurrently. In doubt,
+	 * I am leaving it here for now, but it probably can be removed.
+	 */
+	spin_lock(&init_mm.page_table_lock);
+	pud_clear(pud);
+	spin_unlock(&init_mm.page_table_lock);
+}
+
+/*
+ * When the PUD is folded on the PGD (three levels of paging),
+ * there's no need to free PUDs
+ */
+#if CONFIG_PGTABLE_LEVELS > 3
+static void free_pud_table(pgd_t *pgd, bool direct)
+{
+	pud_t *pud_start, *pud;
+	struct page *page;
+	int i;
+
+	pud_start = (pud_t *) pgd_page_vaddr(*pgd);
+	/* Check if there is no valid entry in the PUD table */
+	for (i = 0; i < PTRS_PER_PUD; i++) {
+		pud = pud_start + i;
+		if (!pud_none(*pud))
+			return;
+	}
+
+	page = pgd_page(*pgd);
+
+	free_pagetable(page, 0, direct);
+
+	/*
+	 * This spin lock could only be taken in __pte_alloc_kernel
+	 * in mm/memory.c and nowhere else (for arm64). Not sure if
+	 * the function above can be called concurrently. In doubt,
+	 * I am leaving it here for now, but it probably can be removed.
+	 */
+	spin_lock(&init_mm.page_table_lock);
+	pgd_clear(pgd);
+	spin_unlock(&init_mm.page_table_lock);
+}
+#endif
+
+static void remove_pte_table(pte_t *pte, unsigned long addr,
+	unsigned long end, bool direct)
+{
+	unsigned long next;
+	void *page_addr;
+
+	for (; addr < end; addr = next, pte++) {
+		next = (addr + PAGE_SIZE) & PAGE_MASK;
+		if (next > end)
+			next = end;
+
+		if (!pte_present(*pte))
+			continue;
+
+		if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
+			/*
+			 * Do not free direct mapping pages since they were
+			 * freed when offlining, or simply not in use.
+			 */
+			if (!direct)
+				free_pagetable(pte_page(*pte), 0, direct);
+
+			/*
+			 * This spin lock could only be taken in
+			 * __pte_alloc_kernel in mm/memory.c and nowhere else
+			 * (for arm64). Not sure if the function above can be
+			 * called concurrently. In doubt, I am leaving it here
+			 * for now, but it probably can be removed.
+			 */
+			spin_lock(&init_mm.page_table_lock);
+			pte_clear(&init_mm, addr, pte);
+			spin_unlock(&init_mm.page_table_lock);
+		} else {
+			/*
+			 * If we are here, we are freeing vmemmap pages since
+			 * direct mapped memory ranges to be freed are aligned.
+			 *
+			 * If we are not removing the whole page, it means
+			 * other page structs in this page are being used and
+			 * we cannot remove them. So fill the unused page
+			 * structs with 0xFD, and remove the page when it is
+			 * wholly filled with 0xFD.
+			 */
+			memset((void *)addr, PAGE_INUSE, next - addr);
+
+			page_addr = page_address(pte_page(*pte));
+			if (!memchr_inv(page_addr, PAGE_INUSE, PAGE_SIZE)) {
+				free_pagetable(pte_page(*pte), 0, direct);
+
+				/*
+				 * This spin lock could only be taken in
+				 * __pte_alloc_kernel in mm/memory.c and
+				 * nowhere else (for arm64). Not sure if the
+				 * function above can be called concurrently.
+				 * In doubt, I am leaving it here for now,
+				 * but it probably can be removed.
+				 */
+				spin_lock(&init_mm.page_table_lock);
+				pte_clear(&init_mm, addr, pte);
+				spin_unlock(&init_mm.page_table_lock);
+			}
+		}
+	}
+
+	/*
+	 * This flush is added here in symmetry with the x86 code.
+	 * It is not clear why it needs to be called here and not in
+	 * remove_p[mu]d_table.
+	 */
+	flush_tlb_all();
+}
+
+static void remove_pmd_table(pmd_t *pmd, unsigned long addr,
+	unsigned long end, bool direct)
+{
+	unsigned long next;
+	void *page_addr;
+	pte_t *pte;
+
+	for (; addr < end; addr = next, pmd++) {
+		next = pmd_addr_end(addr, end);
+
+		if (!pmd_present(*pmd))
+			continue;
+
+		/* Check if we are using 2MB section mappings. */
+		if (pmd_sect(*pmd)) {
+			if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
+				if (!direct) {
+					free_pagetable(pmd_page(*pmd),
+						get_order(PMD_SIZE), direct);
+				}
+				/*
+				 * This spin lock could only be taken in
+				 * __pte_alloc_kernel in mm/memory.c and
+				 * nowhere else (for arm64). Not sure if the
+				 * function above can be called concurrently.
+				 * In doubt, I am leaving it here for now,
+				 * but it probably can be removed.
+				 */
+				spin_lock(&init_mm.page_table_lock);
+				pmd_clear(pmd);
+				spin_unlock(&init_mm.page_table_lock);
+			} else {
+				/* If here, we are freeing vmemmap pages. */
+				memset((void *)addr, PAGE_INUSE, next - addr);
+
+				page_addr = page_address(pmd_page(*pmd));
+				if (!memchr_inv(page_addr, PAGE_INUSE,
+						PMD_SIZE)) {
+					free_pagetable(pmd_page(*pmd),
+						get_order(PMD_SIZE), direct);
+
+					/*
+					 * This spin lock could only be taken
+					 * in __pte_alloc_kernel in mm/memory.c
+					 * and nowhere else (for arm64). Not
+					 * sure if the function above can be
+					 * called concurrently. In doubt, I am
+					 * leaving it here for now, but it
+					 * probably can be removed.
+					 */
+					spin_lock(&init_mm.page_table_lock);
+					pmd_clear(pmd);
+					spin_unlock(&init_mm.page_table_lock);
+				}
+			}
+			continue;
+		}
+
+		BUG_ON(!pmd_table(*pmd));
+
+		pte = pte_offset_map(pmd, addr);
+		remove_pte_table(pte, addr, next, direct);
+		free_pte_table(pmd, direct);
+	}
+}
+
+static void remove_pud_table(pud_t *pud, unsigned long addr,
+	unsigned long end, bool direct)
+{
+	unsigned long next;
+	pmd_t *pmd;
+	void *page_addr;
+
+	for (; addr < end; addr = next, pud++) {
+		next = pud_addr_end(addr, end);
+		if (!pud_present(*pud))
+			continue;
+		/*
+		 * If we are using 4K granules, check if we are using
+		 * 1GB section mapping.
+		 */
+		if (pud_sect(*pud)) {
+			if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
+				if (!direct) {
+					free_pagetable(pud_page(*pud),
+						get_order(PUD_SIZE), direct);
+				}
+
+				/*
+				 * This spin lock could only be taken in
+				 * __pte_alloc_kernel in mm/memory.c and
+				 * nowhere else (for arm64). Not sure if the
+				 * function above can be called concurrently.
+				 * In doubt, I am leaving it here for now,
+				 * but it probably can be removed.
+				 */
+				spin_lock(&init_mm.page_table_lock);
+				pud_clear(pud);
+				spin_unlock(&init_mm.page_table_lock);
+			} else {
+				/* If here, we are freeing vmemmap pages. */
+				memset((void *)addr, PAGE_INUSE, next - addr);
+
+				page_addr = page_address(pud_page(*pud));
+				if (!memchr_inv(page_addr, PAGE_INUSE,
+						PUD_SIZE)) {
+
+					free_pagetable(pud_page(*pud),
+						get_order(PUD_SIZE), direct);
+
+					/*
+					 * This spin lock could only be taken
+					 * in __pte_alloc_kernel in mm/memory.c
+					 * and nowhere else (for arm64). Not
+					 * sure if the function above can be
+					 * called concurrently. In doubt, I am
+					 * leaving it here for now, but it
+					 * probably can be removed.
+					 */
+					spin_lock(&init_mm.page_table_lock);
+					pud_clear(pud);
+					spin_unlock(&init_mm.page_table_lock);
+				}
+			}
+			continue;
+		}
+
+		BUG_ON(!pud_table(*pud));
+
+		pmd = pmd_offset(pud, addr);
+		remove_pmd_table(pmd, addr, next, direct);
+		free_pmd_table(pud, direct);
+	}
+}
+
+void remove_pagetable(unsigned long start, unsigned long end, bool direct)
+{
+	unsigned long next;
+	unsigned long addr;
+	pgd_t *pgd;
+	pud_t *pud;
+
+	for (addr = start; addr < end; addr = next) {
+		next = pgd_addr_end(addr, end);
+
+		pgd = pgd_offset_k(addr);
+		if (pgd_none(*pgd))
+			continue;
+
+		pud = pud_offset(pgd, addr);
+		remove_pud_table(pud, addr, next, direct);
+		/*
+		 * When the PUD is folded on the PGD (three levels of paging),
+		 * the PMD page has already been freed in free_pmd_table,
+		 * which also cleared the corresponding PGD == PUD entry.
+		 */
+#if CONFIG_PGTABLE_LEVELS > 3
+		free_pud_table(pgd, direct);
 #endif
+	}
+
+	flush_tlb_all();
+}
+
+
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+#endif /* CONFIG_MEMORY_HOTPLUG */
 
 /*
  * Check whether a kernel address is valid (derived from arch/x86/).
@@ -731,6 +1103,9 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
 #endif	/* CONFIG_ARM64_64K_PAGES */
 void vmemmap_free(unsigned long start, unsigned long end)
 {
+#ifdef CONFIG_MEMORY_HOTREMOVE
+	remove_pagetable(start, end, false);
+#endif
 }
 #endif /* CONFIG_SPARSEMEM_VMEMMAP */
 
-- 
2.11.0
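
A note on testing (illustrative only, not part of the patch): one way to drive the new arch_remove_memory()/remove_pagetable() path is a throwaway module like the sketch below. It assumes the v4.9-era prototype void remove_memory(int nid, u64 start, u64 size) from include/linux/memory_hotplug.h, that the target range was hot-added earlier and has already been offlined (e.g. by writing "offline" to the matching /sys/devices/system/memory/memoryN/state files; remove_memory() expects the whole range to be offline), and that no other hotplug operation runs concurrently (the in-tree callers serialize via device_hotplug_lock). File and parameter names are made up for illustration.

/* hotremove_test.c - hypothetical sketch, not part of this patch */
#include <linux/module.h>
#include <linux/memory_hotplug.h>

static u64 hr_start;	/* physical base of the offlined block */
static u64 hr_size;	/* size of the range, e.g. one memory section */
module_param(hr_start, ullong, 0444);
module_param(hr_size, ullong, 0444);

static int __init hotremove_test_init(void)
{
	if (!hr_size)
		return -EINVAL;

	/*
	 * remove_memory() ends up in arch_remove_memory(): the struct pages
	 * are torn down via __remove_pages() (freeing the vmemmap through
	 * vmemmap_free() -> remove_pagetable(..., false)), then the linear
	 * mapping is unmapped via kernel_physical_mapping_remove() ->
	 * remove_pagetable(..., true).
	 */
	remove_memory(0, hr_start, hr_size);	/* nid 0: series depends on !NUMA */

	return 0;
}
module_init(hotremove_test_init);

MODULE_LICENSE("GPL");

Loading it with, for example, "insmod hotremove_test.ko hr_start=0x880000000 hr_size=0x40000000" (addresses purely illustrative) after offlining the blocks should leave no trace of the range in /sys/kernel/debug/kernel_page_tables when CONFIG_ARM64_PTDUMP is enabled.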