| // SPDX-License-Identifier: GPL-2.0-only |
| /* |
| * linux/mm/page_alloc.c |
| * |
| * Manages the free list, the system allocates free pages here. |
| * Note that kmalloc() lives in slab.c |
| * |
| * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds |
| * Swap reorganised 29.12.95, Stephen Tweedie |
| * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 |
| * Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999 |
| * Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999 |
| * Zone balancing, Kanoj Sarcar, SGI, Jan 2000 |
| * Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002 |
| * (lots of bits borrowed from Ingo Molnar & Andrew Morton) |
| */ |
| |
| #include <linux/stddef.h> |
| #include <linux/mm.h> |
| #include <linux/highmem.h> |
| #include <linux/swap.h> |
| #include <linux/swapops.h> |
| #include <linux/interrupt.h> |
| #include <linux/pagemap.h> |
| #include <linux/jiffies.h> |
| #include <linux/memblock.h> |
| #include <linux/compiler.h> |
| #include <linux/kernel.h> |
| #include <linux/kasan.h> |
| #include <linux/kmsan.h> |
| #include <linux/module.h> |
| #include <linux/suspend.h> |
| #include <linux/pagevec.h> |
| #include <linux/blkdev.h> |
| #include <linux/slab.h> |
| #include <linux/ratelimit.h> |
| #include <linux/oom.h> |
| #include <linux/topology.h> |
| #include <linux/sysctl.h> |
| #include <linux/cpu.h> |
| #include <linux/cpuset.h> |
| #include <linux/memory_hotplug.h> |
| #include <linux/nodemask.h> |
| #include <linux/vmalloc.h> |
| #include <linux/vmstat.h> |
| #include <linux/mempolicy.h> |
| #include <linux/memremap.h> |
| #include <linux/stop_machine.h> |
| #include <linux/random.h> |
| #include <linux/sort.h> |
| #include <linux/pfn.h> |
| #include <linux/backing-dev.h> |
| #include <linux/fault-inject.h> |
| #include <linux/page-isolation.h> |
| #include <linux/debugobjects.h> |
| #include <linux/kmemleak.h> |
| #include <linux/compaction.h> |
| #include <trace/events/kmem.h> |
| #include <trace/events/oom.h> |
| #include <linux/prefetch.h> |
| #include <linux/mm_inline.h> |
| #include <linux/mmu_notifier.h> |
| #include <linux/migrate.h> |
| #include <linux/hugetlb.h> |
| #include <linux/sched/rt.h> |
| #include <linux/sched/mm.h> |
| #include <linux/page_owner.h> |
| #include <linux/page_table_check.h> |
| #include <linux/kthread.h> |
| #include <linux/memcontrol.h> |
| #include <linux/ftrace.h> |
| #include <linux/lockdep.h> |
| #include <linux/nmi.h> |
| #include <linux/psi.h> |
| #include <linux/padata.h> |
| #include <linux/khugepaged.h> |
| #include <linux/buffer_head.h> |
| #include <linux/delayacct.h> |
| #include <asm/sections.h> |
| #include <asm/tlbflush.h> |
| #include <asm/div64.h> |
| #include "internal.h" |
| #include "shuffle.h" |
| #include "page_reporting.h" |
| #include "swap.h" |
| |
/*
 * Free Page Internal flags: for internal, non-pcp variants of free_pages().
 * Apart from FPI_NONE, each flag occupies its own bit (BIT(0)..BIT(2)).
 */
typedef int __bitwise fpi_t;

/* No special request */
#define FPI_NONE		((__force fpi_t)0)

/*
 * Skip free page reporting notification for the (possibly merged) page.
 * This does not hinder free page reporting from grabbing the page,
 * reporting it and marking it "reported" - it only skips notifying
 * the free page reporting infrastructure about a newly freed page. For
 * example, used when temporarily pulling a page from a freelist and
 * putting it back unmodified.
 */
#define FPI_SKIP_REPORT_NOTIFY	((__force fpi_t)BIT(0))

/*
 * Place the (possibly merged) page at the tail of the freelist. Will ignore
 * page shuffling (relevant code - e.g., memory onlining - is expected to
 * shuffle the whole zone).
 *
 * Note: No code should rely on this flag for correctness - it's purely
 *       to allow for optimizations when handing back either fresh pages
 *       (memory onlining) or untouched pages (page isolation, free page
 *       reporting).
 */
#define FPI_TO_TAIL		((__force fpi_t)BIT(1))

/*
 * Don't poison memory with KASAN (only for the tag-based modes).
 * During boot, all non-reserved memblock memory is exposed to page_alloc.
 * Poisoning all that memory lengthens boot time, especially on systems with
 * a large amount of RAM. This flag is used to skip that poisoning.
 * This is only done for the tag-based KASAN modes, as those are able to
 * detect memory corruptions with the memory tags assigned by default.
 * All memory allocated normally after boot gets poisoned as usual.
 */
#define FPI_SKIP_KASAN_POISON	((__force fpi_t)BIT(2))
| |
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
/*
 * NOTE(review): presumably the smallest non-zero value accepted for
 * percpu_pagelist_high_fraction; the check itself is not in this section.
 */
#define MIN_PERCPU_PAGELIST_HIGH_FRACTION (8)
| |
#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
/*
 * On SMP, spin_trylock is sufficient protection.
 * On PREEMPT_RT, spin_trylock is equivalent on both SMP and UP.
 * Both helpers expand to nothing here and never touch @flags.
 */
#define pcp_trylock_prepare(flags)	do { } while (0)
#define pcp_trylock_finish(flags)	do { } while (0)
#else

/*
 * UP spin_trylock always succeeds so disable IRQs to prevent re-entrancy.
 * @flags receives the saved IRQ state in prepare and is consumed by finish.
 */
#define pcp_trylock_prepare(flags)	local_irq_save(flags)
#define pcp_trylock_finish(flags)	local_irq_restore(flags)
#endif
| |
/*
 * Taking a pcp lock is a two-step operation: look up this CPU's PCP, then
 * take its spinlock. The task must stay on the same CPU across both steps,
 * otherwise a migration could lock the wrong PCP and remote memory could be
 * allocated.
 *
 * PREEMPT_RT uses migrate_disable(): anything stronger would interfere with
 * RT spinlock usage and stop a high priority task from preempting the
 * allocator. Everything else uses preempt_disable(), which is faster than
 * migrate_disable().
 */
#ifdef CONFIG_PREEMPT_RT
#define pcpu_task_pin()		migrate_disable()
#define pcpu_task_unpin()	migrate_enable()
#else
#define pcpu_task_pin()		preempt_disable()
#define pcpu_task_unpin()	preempt_enable()
#endif
| |
/*
 * Generic helper to look up and lock a per-cpu variable with an embedded
 * spinlock: pins the task, resolves this CPU's instance of @ptr and takes
 * its @member lock. Evaluates to the locked instance (a "type *"), which
 * should be released with the equivalent unlock helper.
 */
#define pcpu_spin_lock(type, member, ptr)				\
({									\
	type *_locked;							\
									\
	pcpu_task_pin();						\
	_locked = this_cpu_ptr(ptr);					\
	spin_lock(&_locked->member);					\
	_locked;							\
})
| |
/*
 * Trylock flavour of pcpu_spin_lock(): evaluates to the locked per-cpu
 * instance, or to NULL when the lock could not be taken. On failure the
 * task is unpinned again, so only a non-NULL result needs an unlock.
 */
#define pcpu_spin_trylock(type, member, ptr)				\
({									\
	type *_cand, *_locked = NULL;					\
									\
	pcpu_task_pin();						\
	_cand = this_cpu_ptr(ptr);					\
	if (spin_trylock(&_cand->member))				\
		_locked = _cand;					\
	else								\
		pcpu_task_unpin();					\
	_locked;							\
})
| |
/*
 * Release a lock taken by pcpu_spin_lock()/pcpu_spin_trylock(): drops the
 * @member spinlock of the instance @ptr points to, then unpins the task.
 * @ptr is parenthesised so that any pointer-valued expression is accepted.
 */
#define pcpu_spin_unlock(member, ptr)					\
({									\
	spin_unlock(&(ptr)->member);					\
	pcpu_task_unpin();						\
})
| |
/*
 * struct per_cpu_pages specific helpers: the generic pcpu_spin_*() macros
 * with the type fixed to struct per_cpu_pages and the lock member to "lock".
 */
#define pcp_spin_lock(ptr)						\
	pcpu_spin_lock(struct per_cpu_pages, lock, ptr)

#define pcp_spin_trylock(ptr)						\
	pcpu_spin_trylock(struct per_cpu_pages, lock, ptr)

#define pcp_spin_unlock(ptr)						\
	pcpu_spin_unlock(lock, ptr)
| |
#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID
/* Per-cpu NUMA node id, exported for module use. */
DEFINE_PER_CPU(int, numa_node);
EXPORT_PER_CPU_SYMBOL(numa_node);
#endif

/* Static key, initially true. */
DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key);

#ifdef CONFIG_HAVE_MEMORYLESS_NODES
/*
 * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly.
 * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined.
 * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem()
 * defined in <linux/topology.h>.
 */
DEFINE_PER_CPU(int, _numa_mem_);		/* Kernel "local memory" node */
EXPORT_PER_CPU_SYMBOL(_numa_mem_);
#endif
| |
/* NOTE(review): presumably serialises per-cpu page draining; users are outside this section. */
static DEFINE_MUTEX(pcpu_drain_mutex);

#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
/* Exported global used with the latent entropy GCC plugin (see the __latent_entropy marking). */
volatile unsigned long latent_entropy __latent_entropy;
EXPORT_SYMBOL(latent_entropy);
#endif
| |
/*
 * Array of node states, one nodemask per state.
 *
 * Initially every node is marked possible and only node 0 is online.
 * Without CONFIG_NUMA, node 0 is additionally marked as having normal
 * memory, high memory (CONFIG_HIGHMEM only), memory and a CPU.
 */
nodemask_t node_states[NR_NODE_STATES] __read_mostly = {
	[N_POSSIBLE] = NODE_MASK_ALL,
	[N_ONLINE] = { { [0] = 1UL } },
#ifndef CONFIG_NUMA
	[N_NORMAL_MEMORY] = { { [0] = 1UL } },
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = { { [0] = 1UL } },
#endif
	[N_MEMORY] = { { [0] = 1UL } },
	[N_CPU] = { { [0] = 1UL } },
#endif	/* NUMA */
};
EXPORT_SYMBOL(node_states);
| |
/* Global page totals; only _totalram_pages is atomic and exported. */
atomic_long_t _totalram_pages __read_mostly;
EXPORT_SYMBOL(_totalram_pages);
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;

int percpu_pagelist_high_fraction;
/* Starts out as GFP_BOOT_MASK. */
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
/* Static keys whose build-time default follows CONFIG_INIT_ON_{ALLOC,FREE}_DEFAULT_ON. */
DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, init_on_alloc);
EXPORT_SYMBOL(init_on_alloc);

DEFINE_STATIC_KEY_MAYBE(CONFIG_INIT_ON_FREE_DEFAULT_ON, init_on_free);
EXPORT_SYMBOL(init_on_free);
| |
/* Early request for init_on_alloc; defaults to CONFIG_INIT_ON_ALLOC_DEFAULT_ON. */
static bool _init_on_alloc_enabled_early __read_mostly
				= IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);

/*
 * Handler for the "init_on_alloc" early parameter: parses @buf as a boolean
 * into _init_on_alloc_enabled_early and returns kstrtobool()'s result.
 */
static int __init early_init_on_alloc(char *buf)
{

	return kstrtobool(buf, &_init_on_alloc_enabled_early);
}
early_param("init_on_alloc", early_init_on_alloc);
| |
/* Early request for init_on_free; defaults to CONFIG_INIT_ON_FREE_DEFAULT_ON. */
static bool _init_on_free_enabled_early __read_mostly
				= IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);

/*
 * Handler for the "init_on_free" early parameter: parses @buf as a boolean
 * into _init_on_free_enabled_early and returns kstrtobool()'s result.
 */
static int __init early_init_on_free(char *buf)
{
	return kstrtobool(buf, &_init_on_free_enabled_early);
}
early_param("init_on_free", early_init_on_free);
| |
/*
 * A cached value of the page's pageblock's migratetype, used when the page is
 * put on a pcplist. Used to avoid the pageblock migratetype lookup when
 * freeing from pcplists in most cases, at the cost of possibly becoming stale.
 * Also the migratetype set in the page does not necessarily match the pcplist
 * index, e.g. page might have MIGRATE_CMA set but be on a pcplist with any
 * other index - this ensures that it will be put on the correct CMA freelist.
 *
 * The cached value is kept in page->index.
 */
static inline int get_pcppage_migratetype(struct page *page)
{
	return page->index;
}
| |
/* Cache @migratetype in page->index for a later get_pcppage_migratetype(). */
static inline void set_pcppage_migratetype(struct page *page, int migratetype)
{
	page->index = migratetype;
}
| |
| #ifdef CONFIG_PM_SLEEP |
/*
 * The following functions are used by the suspend/hibernate code to temporarily
 * change gfp_allowed_mask in order to avoid using I/O during memory allocations
 * while devices are suspended.  To avoid races with the suspend/hibernate code,
 * they should always be called with system_transition_mutex held
 * (gfp_allowed_mask also should only be modified with system_transition_mutex
 * held, unless the suspend/hibernate code is guaranteed not to run in parallel
 * with that modification).
 */

/* gfp_allowed_mask as it was before restriction; 0 means nothing is saved. */
static gfp_t saved_gfp_mask;
| |
| void pm_restore_gfp_mask(void) |
| { |
| WARN_ON(!mutex_is_locked(&system_transition_mutex)); |
| if (saved_gfp_mask) { |
| gfp_allowed_mask = saved_gfp_mask; |
| saved_gfp_mask = 0; |
| } |
| } |
| |
| void pm_restrict_gfp_mask(void) |
| { |
| WARN_ON(!mutex_is_locked(&system_transition_mutex)); |
| WARN_ON(saved_gfp_mask); |
| saved_gfp_mask = gfp_allowed_mask; |
| gfp_allowed_mask &= ~(__GFP_IO | __GFP_FS); |
| } |
| |
| bool pm_suspended_storage(void) |
| { |
| if ((gfp_allowed_mask & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) |
| return false; |
| return true; |
| } |
| #endif /* CONFIG_PM_SLEEP */ |
| |
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
/* Only a run-time variable when the huge page size is not fixed at build time. */
unsigned int pageblock_order __read_mostly;
#endif

/* Forward declaration: used by free_the_page() before its definition. */
static void __free_pages_ok(struct page *page, unsigned int order,
			    fpi_t fpi_flags);
| |
/*
 * results with 256, 32 in the lowmem_reserve sysctl:
 *	1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
 *	1G machine -> (16M dma, 784M normal, 224M high)
 *	NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
 *	HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
 *	HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
 *
 * TBD: should special case ZONE_DMA32 machines here - in those we normally
 * don't need any ZONE_NORMAL reservation
 *
 * Entries exist only for zones that are configured in; ZONE_HIGHMEM and
 * ZONE_MOVABLE default to 0.
 */
int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	[ZONE_DMA] = 256,
#endif
#ifdef CONFIG_ZONE_DMA32
	[ZONE_DMA32] = 256,
#endif
	[ZONE_NORMAL] = 32,
#ifdef CONFIG_HIGHMEM
	[ZONE_HIGHMEM] = 0,
#endif
	[ZONE_MOVABLE] = 0,
};
| |
/*
 * Human-readable zone names. Entries are positional, so the #ifdef layout
 * here determines which name lands at which index.
 * NOTE(review): must stay in step with the zone enumeration - confirm
 * against its definition when adding a zone.
 */
static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
	 "DMA",
#endif
#ifdef CONFIG_ZONE_DMA32
	 "DMA32",
#endif
	 "Normal",
#ifdef CONFIG_HIGHMEM
	 "HighMem",
#endif
	 "Movable",
#ifdef CONFIG_ZONE_DEVICE
	 "Device",
#endif
};
| |
/*
 * Human-readable migratetype names, positional like zone_names.
 * NOTE(review): order is presumably that of the migratetype enumeration -
 * confirm against its definition.
 */
const char * const migratetype_names[MIGRATE_TYPES] = {
	"Unmovable",
	"Movable",
	"Reclaimable",
	"HighAtomic",
#ifdef CONFIG_CMA
	"CMA",
#endif
#ifdef CONFIG_MEMORY_ISOLATION
	"Isolate",
#endif
};
| |
/*
 * Compound page destructors, indexed by destructor id. destroy_large_folio()
 * calls through this table. The NULL_COMPOUND_DTOR slot holds no function.
 */
compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
	[NULL_COMPOUND_DTOR] = NULL,
	[COMPOUND_PAGE_DTOR] = free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
	[HUGETLB_PAGE_DTOR] = free_huge_page,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	[TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
#endif
};
| |
/* Watermark tunables and their initial values. */
int min_free_kbytes = 1024;
/* NOTE(review): -1 presumably means "not set by the user" - confirm at the users. */
int user_min_free_kbytes = -1;
int watermark_boost_factor __read_mostly = 15000;
int watermark_scale_factor = 10;
| |
/* Init-only bookkeeping (__initdata): not to be referenced after boot. */
static unsigned long nr_kernel_pages __initdata;
static unsigned long nr_all_pages __initdata;
static unsigned long dma_reserve __initdata;

/* Per-zone pfn limits and kernelcore/movablecore requests, also init-only. */
static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long required_kernelcore __initdata;
static unsigned long required_kernelcore_percent __initdata;
static unsigned long required_movablecore __initdata;
static unsigned long required_movablecore_percent __initdata;
static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
bool mirrored_kernelcore __initdata_memblock;

/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);
| |
#if MAX_NUMNODES > 1
/* Only defined when more than one node is possible; both are exported. */
unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
unsigned int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif
| |
/* Forward declarations of the unaccepted-memory helpers; defined outside this section. */
static bool page_contains_unaccepted(struct page *page, unsigned int order);
static void accept_page(struct page *page, unsigned int order);
static bool try_to_accept_memory(struct zone *zone, unsigned int order);
static inline bool has_unaccepted_memory(void);
static bool __free_unaccepted(struct page *page);

/* When non-zero, set_pageblock_migratetype() forces pcp migratetypes to MIGRATE_UNMOVABLE. */
int page_group_by_mobility_disabled __read_mostly;
| |
| #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
/*
 * During boot we initialize deferred pages on-demand, as needed, but once
 * page_alloc_init_late() has finished, the deferred pages are all initialized,
 * and we can permanently disable that path.
 */
static DEFINE_STATIC_KEY_TRUE(deferred_pages);

/* Test the deferred_pages static key (hinted as unlikely to be set). */
static inline bool deferred_pages_enabled(void)
{
	return static_branch_unlikely(&deferred_pages);
}
| |
| /* Returns true if the struct page for the pfn is uninitialised */ |
| static inline bool __meminit early_page_uninitialised(unsigned long pfn) |
| { |
| int nid = early_pfn_to_nid(pfn); |
| |
| if (node_online(nid) && pfn >= NODE_DATA(nid)->first_deferred_pfn) |
| return true; |
| |
| return false; |
| } |
| |
| /* |
| * Returns true when the remaining initialisation should be deferred until |
| * later in the boot cycle when it can be parallelised. |
| */ |
| static bool __meminit |
| defer_init(int nid, unsigned long pfn, unsigned long end_pfn) |
| { |
| static unsigned long prev_end_pfn, nr_initialised; |
| |
| if (early_page_ext_enabled()) |
| return false; |
| /* |
| * prev_end_pfn static that contains the end of previous zone |
| * No need to protect because called very early in boot before smp_init. |
| */ |
| if (prev_end_pfn != end_pfn) { |
| prev_end_pfn = end_pfn; |
| nr_initialised = 0; |
| } |
| |
| /* Always populate low zones for address-constrained allocations */ |
| if (end_pfn < pgdat_end_pfn(NODE_DATA(nid))) |
| return false; |
| |
| if (NODE_DATA(nid)->first_deferred_pfn != ULONG_MAX) |
| return true; |
| /* |
| * We start only with one section of pages, more pages are added as |
| * needed until the rest of deferred pages are initialized. |
| */ |
| nr_initialised++; |
| if ((nr_initialised > PAGES_PER_SECTION) && |
| (pfn & (PAGES_PER_SECTION - 1)) == 0) { |
| NODE_DATA(nid)->first_deferred_pfn = pfn; |
| return true; |
| } |
| return false; |
| } |
| #else |
/*
 * Stubs for builds without CONFIG_DEFERRED_STRUCT_PAGE_INIT: nothing is
 * ever deferred, so all three report false.
 */
static inline bool deferred_pages_enabled(void)
{
	return false;
}

static inline bool early_page_uninitialised(unsigned long pfn)
{
	return false;
}

static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
	return false;
}
| #endif |
| |
| /* Return a pointer to the bitmap storing bits affecting a block of pages */ |
| static inline unsigned long *get_pageblock_bitmap(const struct page *page, |
| unsigned long pfn) |
| { |
| #ifdef CONFIG_SPARSEMEM |
| return section_to_usemap(__pfn_to_section(pfn)); |
| #else |
| return page_zone(page)->pageblock_flags; |
| #endif /* CONFIG_SPARSEMEM */ |
| } |
| |
| static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) |
| { |
| #ifdef CONFIG_SPARSEMEM |
| pfn &= (PAGES_PER_SECTION-1); |
| #else |
| pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn); |
| #endif /* CONFIG_SPARSEMEM */ |
| return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; |
| } |
| |
| static __always_inline |
| unsigned long __get_pfnblock_flags_mask(const struct page *page, |
| unsigned long pfn, |
| unsigned long mask) |
| { |
| unsigned long *bitmap; |
| unsigned long bitidx, word_bitidx; |
| unsigned long word; |
| |
| bitmap = get_pageblock_bitmap(page, pfn); |
| bitidx = pfn_to_bitidx(page, pfn); |
| word_bitidx = bitidx / BITS_PER_LONG; |
| bitidx &= (BITS_PER_LONG-1); |
| /* |
| * This races, without locks, with set_pfnblock_flags_mask(). Ensure |
| * a consistent read of the memory array, so that results, even though |
| * racy, are not corrupted. |
| */ |
| word = READ_ONCE(bitmap[word_bitidx]); |
| return (word >> bitidx) & mask; |
| } |
| |
/**
 * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
 * @page: The page within the block of interest
 * @pfn: The target page frame number
 * @mask: mask of bits that the caller is interested in
 *
 * Out-of-line wrapper around the always-inline __get_pfnblock_flags_mask().
 *
 * Return: pageblock_bits flags
 */
unsigned long get_pfnblock_flags_mask(const struct page *page,
					unsigned long pfn, unsigned long mask)
{
	return __get_pfnblock_flags_mask(page, pfn, mask);
}
| |
/* Return the migratetype bits (MIGRATETYPE_MASK) of the pageblock containing @pfn. */
static __always_inline int get_pfnblock_migratetype(const struct page *page,
					unsigned long pfn)
{
	return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
}
| |
| /** |
| * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages |
| * @page: The page within the block of interest |
| * @flags: The flags to set |
| * @pfn: The target page frame number |
| * @mask: mask of bits that the caller is interested in |
| */ |
| void set_pfnblock_flags_mask(struct page *page, unsigned long flags, |
| unsigned long pfn, |
| unsigned long mask) |
| { |
| unsigned long *bitmap; |
| unsigned long bitidx, word_bitidx; |
| unsigned long word; |
| |
| BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); |
| BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits)); |
| |
| bitmap = get_pageblock_bitmap(page, pfn); |
| bitidx = pfn_to_bitidx(page, pfn); |
| word_bitidx = bitidx / BITS_PER_LONG; |
| bitidx &= (BITS_PER_LONG-1); |
| |
| VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page); |
| |
| mask <<= bitidx; |
| flags <<= bitidx; |
| |
| word = READ_ONCE(bitmap[word_bitidx]); |
| do { |
| } while (!try_cmpxchg(&bitmap[word_bitidx], &word, (word & ~mask) | flags)); |
| } |
| |
| void set_pageblock_migratetype(struct page *page, int migratetype) |
| { |
| if (unlikely(page_group_by_mobility_disabled && |
| migratetype < MIGRATE_PCPTYPES)) |
| migratetype = MIGRATE_UNMOVABLE; |
| |
| set_pfnblock_flags_mask(page, (unsigned long)migratetype, |
| page_to_pfn(page), MIGRATETYPE_MASK); |
| } |
| |
| #ifdef CONFIG_DEBUG_VM |
| static int page_outside_zone_boundaries(struct zone *zone, struct page *page) |
| { |
| int ret = 0; |
| unsigned seq; |
| unsigned long pfn = page_to_pfn(page); |
| unsigned long sp, start_pfn; |
| |
| do { |
| seq = zone_span_seqbegin(zone); |
| start_pfn = zone->zone_start_pfn; |
| sp = zone->spanned_pages; |
| if (!zone_spans_pfn(zone, pfn)) |
| ret = 1; |
| } while (zone_span_seqretry(zone, seq)); |
| |
| if (ret) |
| pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", |
| pfn, zone_to_nid(zone), zone->name, |
| start_pfn, start_pfn + sp); |
| |
| return ret; |
| } |
| |
/* Return 1 if @page belongs to @zone according to page_zone(), 0 otherwise. */
static int page_is_consistent(struct zone *zone, struct page *page)
{
	return zone == page_zone(page);
}
| /* |
| * Temporary debugging check for pages not lying within a given zone. |
| */ |
| static int __maybe_unused bad_range(struct zone *zone, struct page *page) |
| { |
| if (page_outside_zone_boundaries(zone, page)) |
| return 1; |
| if (!page_is_consistent(zone, page)) |
| return 1; |
| |
| return 0; |
| } |
| #else |
/* Without CONFIG_DEBUG_VM no range check is performed: always reports 0. */
static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
{
	return 0;
}
| #endif |
| |
| static void bad_page(struct page *page, const char *reason) |
| { |
| static unsigned long resume; |
| static unsigned long nr_shown; |
| static unsigned long nr_unshown; |
| |
| /* |
| * Allow a burst of 60 reports, then keep quiet for that minute; |
| * or allow a steady drip of one report per second. |
| */ |
| if (nr_shown == 60) { |
| if (time_before(jiffies, resume)) { |
| nr_unshown++; |
| goto out; |
| } |
| if (nr_unshown) { |
| pr_alert( |
| "BUG: Bad page state: %lu messages suppressed\n", |
| nr_unshown); |
| nr_unshown = 0; |
| } |
| nr_shown = 0; |
| } |
| if (nr_shown++ == 0) |
| resume = jiffies + 60 * HZ; |
| |
| pr_alert("BUG: Bad page state in process %s pfn:%05lx\n", |
| current->comm, page_to_pfn(page)); |
| dump_page(page, reason); |
| |
| print_modules(); |
| dump_stack(); |
| out: |
| /* Leave bad fields for debug, except PageBuddy could make trouble */ |
| page_mapcount_reset(page); /* remove PageBuddy */ |
| add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); |
| } |
| |
| static inline unsigned int order_to_pindex(int migratetype, int order) |
| { |
| int base = order; |
| |
| #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
| if (order > PAGE_ALLOC_COSTLY_ORDER) { |
| VM_BUG_ON(order != pageblock_order); |
| return NR_LOWORDER_PCP_LISTS; |
| } |
| #else |
| VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); |
| #endif |
| |
| return (MIGRATE_PCPTYPES * base) + migratetype; |
| } |
| |
| static inline int pindex_to_order(unsigned int pindex) |
| { |
| int order = pindex / MIGRATE_PCPTYPES; |
| |
| #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
| if (pindex == NR_LOWORDER_PCP_LISTS) |
| order = pageblock_order; |
| #else |
| VM_BUG_ON(order > PAGE_ALLOC_COSTLY_ORDER); |
| #endif |
| |
| return order; |
| } |
| |
| static inline bool pcp_allowed_order(unsigned int order) |
| { |
| if (order <= PAGE_ALLOC_COSTLY_ORDER) |
| return true; |
| #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
| if (order == pageblock_order) |
| return true; |
| #endif |
| return false; |
| } |
| |
| static inline void free_the_page(struct page *page, unsigned int order) |
| { |
| if (pcp_allowed_order(order)) /* Via pcp? */ |
| free_unref_page(page, order); |
| else |
| __free_pages_ok(page, order, FPI_NONE); |
| } |
| |
/*
 * Higher-order pages are called "compound pages".  They are structured as
 * follows:
 *
 * The first PAGE_SIZE page is called the "head page" and has PG_head set.
 *
 * The remaining PAGE_SIZE pages are called "tail pages". PageTail() is encoded
 * in bit 0 of page->compound_head. The rest of the bits is a pointer to the
 * head page.
 *
 * The first tail page's ->compound_dtor holds the offset in the array of
 * compound page destructors. See compound_page_dtors.
 *
 * The first tail page's ->compound_order holds the order of allocation.
 * This usage means that zero-order pages may not be compound.
 */
| |
/*
 * Default compound page destructor: uncharge the page's folio from its
 * memory cgroup, then free the page at its compound order.
 */
void free_compound_page(struct page *page)
{
	struct folio *folio = page_folio(page);

	mem_cgroup_uncharge(folio);
	free_the_page(page, compound_order(page));
}
| |
/*
 * Initialise the head-side metadata of a compound page of @order: default
 * destructor, order, compound mapcount (-1) and compound pincount (0).
 */
static void prep_compound_head(struct page *page, unsigned int order)
{
	set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
	set_compound_order(page, order);
	atomic_set(compound_mapcount_ptr(page), -1);
	atomic_set(compound_pincount_ptr(page), 0);
}
| |
| static void prep_compound_tail(struct page *head, int tail_idx) |
| { |
| struct page *p = head + tail_idx; |
| |
| p->mapping = TAIL_MAPPING; |
| set_compound_head(p, head); |
| set_page_private(p, 0); |
| } |
| |
/*
 * Set up @page as the head of a compound page of 2^@order pages: flag the
 * head, prepare every following page as a tail, then fill in the head's
 * compound metadata.
 */
void prep_compound_page(struct page *page, unsigned int order)
{
	const int nr_pages = 1 << order;
	int idx = 1;

	__SetPageHead(page);

	while (idx < nr_pages) {
		prep_compound_tail(page, idx);
		idx++;
	}

	prep_compound_head(page, order);
}
| |
| void destroy_large_folio(struct folio *folio) |
| { |
| enum compound_dtor_id dtor = folio_page(folio, 1)->compound_dtor; |
| |
| VM_BUG_ON_FOLIO(dtor >= NR_COMPOUND_DTORS, folio); |
| compound_page_dtors[dtor](&folio->page); |
| } |
| |
| #ifdef CONFIG_DEBUG_PAGEALLOC |
/* Set from the "debug_guardpage_minorder" early parameter. */
unsigned int _debug_guardpage_minorder;

/* Early request for debug_pagealloc; defaults to CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT. */
bool _debug_pagealloc_enabled_early __read_mostly
			= IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
/* Both keys start false and are enabled by init_mem_debugging_and_hardening(). */
DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
EXPORT_SYMBOL(_debug_pagealloc_enabled);

DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
| |
/*
 * Handler for the "debug_pagealloc" early parameter: parses @buf as a boolean
 * into _debug_pagealloc_enabled_early and returns kstrtobool()'s result.
 */
static int __init early_debug_pagealloc(char *buf)
{
	return kstrtobool(buf, &_debug_pagealloc_enabled_early);
}
early_param("debug_pagealloc", early_debug_pagealloc);
| |
| static int __init debug_guardpage_minorder_setup(char *buf) |
| { |
| unsigned long res; |
| |
| if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { |
| pr_err("Bad debug_guardpage_minorder value\n"); |
| return 0; |
| } |
| _debug_guardpage_minorder = res; |
| pr_info("Setting debug_guardpage_minorder to %lu\n", res); |
| return 0; |
| } |
| early_param("debug_guardpage_minorder", debug_guardpage_minorder_setup); |
| |
| static inline bool set_page_guard(struct zone *zone, struct page *page, |
| unsigned int order, int migratetype) |
| { |
| if (!debug_guardpage_enabled()) |
| return false; |
| |
| if (order >= debug_guardpage_minorder()) |
| return false; |
| |
| __SetPageGuard(page); |
| INIT_LIST_HEAD(&page->buddy_list); |
| set_page_private(page, order); |
| /* Guard pages are not available for any usage */ |
| if (!is_migrate_isolate(migratetype)) |
| __mod_zone_freepage_state(zone, -(1 << order), migratetype); |
| |
| return true; |
| } |
| |
/*
 * Undo set_page_guard(): drop the guard flag and private order of @page and,
 * unless @migratetype is an isolate type, add its 2^@order pages back to the
 * zone's free page state. Does nothing when guard pages are disabled.
 */
static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype)
{
	if (!debug_guardpage_enabled())
		return;

	__ClearPageGuard(page);
	set_page_private(page, 0);

	if (is_migrate_isolate(migratetype))
		return;

	__mod_zone_freepage_state(zone, (1 << order), migratetype);
}
| #else |
/* Without CONFIG_DEBUG_PAGEALLOC there are no guard pages: both helpers are no-ops. */
static inline bool set_page_guard(struct zone *zone, struct page *page,
			unsigned int order, int migratetype) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
				unsigned int order, int migratetype) {}
| #endif |
| |
| /* |
| * Enable static keys related to various memory debugging and hardening options. |
| * Some override others, and depend on early params that are evaluated in the |
| * order of appearance. So we need to first gather the full picture of what was |
| * enabled, and then make decisions. |
| */ |
| void __init init_mem_debugging_and_hardening(void) |
| { |
| bool page_poisoning_requested = false; |
| |
| #ifdef CONFIG_PAGE_POISONING |
| /* |
| * Page poisoning is debug page alloc for some arches. If |
| * either of those options are enabled, enable poisoning. |
| */ |
| if (page_poisoning_enabled() || |
| (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && |
| debug_pagealloc_enabled())) { |
| static_branch_enable(&_page_poisoning_enabled); |
| page_poisoning_requested = true; |
| } |
| #endif |
| |
| if ((_init_on_alloc_enabled_early || _init_on_free_enabled_early) && |
| page_poisoning_requested) { |
| pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " |
| "will take precedence over init_on_alloc and init_on_free\n"); |
| _init_on_alloc_enabled_early = false; |
| _init_on_free_enabled_early = false; |
| } |
| |
| if (_init_on_alloc_enabled_early) |
| static_branch_enable(&init_on_alloc); |
| else |
| static_branch_disable(&init_on_alloc); |
| |
| if (_init_on_free_enabled_early) |
| static_branch_enable(&init_on_free); |
| else |
| static_branch_disable(&init_on_free); |
| |
| if (IS_ENABLED(CONFIG_KMSAN) && |
| (_init_on_alloc_enabled_early || _init_on_free_enabled_early)) |
| pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n"); |
| |
| #ifdef CONFIG_DEBUG_PAGEALLOC |
| if (!debug_pagealloc_enabled()) |
| return; |
| |
| static_branch_enable(&_debug_pagealloc_enabled); |
| |
| if (!debug_guardpage_minorder()) |
| return; |
| |
| static_branch_enable(&_debug_guardpage_enabled); |
| #endif |
| } |
| |
/* Store @order in the page's private field and mark the page PageBuddy. */
static inline void set_buddy_order(struct page *page, unsigned int order)
{
	set_page_private(page, order);
	__SetPageBuddy(page);
}
| |
| #ifdef CONFIG_COMPACTION |
| static inline struct capture_control *task_capc(struct zone *zone) |
| { |
| struct capture_control *capc = current->capture_control; |
| |
| return unlikely(capc) && |
| !(current->flags & PF_KTHREAD) && |
| !capc->page && |
| capc->cc->zone == zone ? capc : NULL; |
| } |
| |
| static inline bool |
| compaction_capture(struct capture_control *capc, struct page *page, |
| int order, int migratetype) |
| { |
| if (!capc || order != capc->cc->order) |
| return false; |
| |
| /* Do not accidentally pollute CMA or isolated regions*/ |
| if (is_migrate_cma(migratetype) || |
| is_migrate_isolate(migratetype)) |
| return false; |
| |
| /* |
| * Do not let lower order allocations pollute a movable pageblock. |
| * This might let an unmovable request use a reclaimable pageblock |
| * and vice-versa but no more than normal fallback logic which can |
| * have trouble finding a high-order free page. |
| */ |
| if (order < pageblock_order && migratetype == MIGRATE_MOVABLE) |
| return false; |
| |
| capc->page = page; |
| return true; |
| } |
| |
| #else |
/* Without CONFIG_COMPACTION there is never a capture context... */
static inline struct capture_control *task_capc(struct zone *zone)
{
	return NULL;
}

/* ...and no page is ever captured. */
static inline bool
compaction_capture(struct capture_control *capc, struct page *page,
		   int order, int migratetype)
{
	return false;
}
| #endif /* CONFIG_COMPACTION */ |
| |
| /* Used for pages not on another list */ |
| static inline void add_to_free_list(struct page *page, struct zone *zone, |
| unsigned int order, int migratetype) |
| { |
| struct free_area *area = &zone->free_area[order]; |
| |
| list_add(&page->buddy_list, &area->free_list[migratetype]); |
| area->nr_free++; |
| } |
| |
| /* |
| * Used for pages not on another list: append @page to the tail of @zone's |
| * free list for (@order, @migratetype) and bump that order's nr_free count. |
| */ |
| static inline void add_to_free_list_tail(struct page *page, struct zone *zone, |
| unsigned int order, int migratetype) |
| { |
| struct list_head *head = &zone->free_area[order].free_list[migratetype]; |
| |
| list_add_tail(&page->buddy_list, head); |
| zone->free_area[order].nr_free++; |
| } |
| |
| /* |
| * Used for pages which are already on another list. The page is moved to the |
| * tail of the (@order, @migratetype) free list so that it is not immediately |
| * considered for allocation again (e.g., optimization for memory onlining). |
| * nr_free is not touched by this helper. |
| */ |
| static inline void move_to_free_list(struct page *page, struct zone *zone, |
| unsigned int order, int migratetype) |
| { |
| struct list_head *dest = &zone->free_area[order].free_list[migratetype]; |
| |
| list_move_tail(&page->buddy_list, dest); |
| } |
| |
| /* |
| * Unlink @page from the free list it is on and undo its buddy state: the |
| * Reported flag (if set) and the Buddy flag are cleared, the private field is |
| * reset to 0 and the nr_free count of @order in @zone is decremented. |
| */ |
| static inline void del_page_from_free_list(struct page *page, struct zone *zone, |
| unsigned int order) |
| { |
| struct free_area *area = &zone->free_area[order]; |
| |
| /* a page leaving the free list no longer counts as reported */ |
| if (page_reported(page)) |
| __ClearPageReported(page); |
| |
| list_del(&page->buddy_list); |
| __ClearPageBuddy(page); |
| set_page_private(page, 0); |
| area->nr_free--; |
| } |
| |
| /* |
| * Decide whether a block that could not be merged any further is likely to |
| * be merged soon. |
| * |
| * Unless the block is already too large (@order >= MAX_ORDER - 2), look at |
| * the block of the next-higher order that would contain it and check whether |
| * that block's buddy is free. If so, more frees are probably on their way |
| * and the caller should queue the page at the tail of the free list, where it |
| * is less likely to be handed out before it can coalesce. |
| * |
| * @pfn/@page describe the block, @buddy_pfn is the pfn of its buddy. |
| */ |
| static inline bool |
| buddy_merge_likely(unsigned long pfn, unsigned long buddy_pfn, |
| struct page *page, unsigned int order) |
| { |
| unsigned long parent_pfn; |
| struct page *parent; |
| |
| if (order >= MAX_ORDER - 2) |
| return false; |
| |
| /* start of the (order + 1) block covering both @pfn and @buddy_pfn */ |
| parent_pfn = pfn & buddy_pfn; |
| parent = page + (parent_pfn - pfn); |
| |
| if (find_buddy_page_pfn(parent, parent_pfn, order + 1, NULL)) |
| return true; |
| |
| return false; |
| } |
| |
| /* |
| * Freeing function for a buddy system allocator. |
| * |
| * The concept of a buddy system is to maintain direct-mapped table |
| * (containing bit values) for memory blocks of various "orders". |
| * The bottom level table contains the map for the smallest allocatable |
| * units of memory (here, pages), and each level above it describes |
| * pairs of units from the levels below, hence, "buddies". |
| * At a high level, all that happens here is marking the table entry |
| * at the bottom level available, and propagating the changes upward |
| * as necessary, plus some accounting needed to play nicely with other |
| * parts of the VM system. |
| * At each level, we keep a list of pages, which are heads of continuous |
| * free pages of length of (1 << order) and marked with PageBuddy. |
| * Page's order is recorded in page_private(page) field. |
| * So when we are allocating or freeing one, we can derive the state of the |
| * other. That is, if we allocate a small block, and both were |
| * free, the remainder of the region must be split into blocks. |
| * If a block is freed, and its buddy is also free, then this |
| * triggers coalescing into a block of larger size. |
| * |
| * -- nyc |
| */ |
| |
| /* |
| * Put the block of (1 << @order) pages starting at @page/@pfn on the free |
| * lists of @zone, merging it with free buddies as far as possible. |
| * |
| * @migratetype selects the free list and is used for the free page |
| * accounting. @fpi_flags: FPI_TO_TAIL forces tail placement, |
| * FPI_SKIP_REPORT_NOTIFY suppresses the page reporting notification. |
| * |
| * All callers visible in this file invoke it with zone->lock held. |
| */ |
| static inline void __free_one_page(struct page *page, |
| unsigned long pfn, |
| struct zone *zone, unsigned int order, |
| int migratetype, fpi_t fpi_flags) |
| { |
| struct capture_control *capc = task_capc(zone); |
| unsigned long buddy_pfn = 0; |
| unsigned long combined_pfn; |
| struct page *buddy; |
| bool to_tail; |
| |
| VM_BUG_ON(!zone_is_initialized(zone)); |
| VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page); |
| |
| VM_BUG_ON(migratetype == -1); |
| /* Isolated pages are not accounted as free pages. */ |
| if (likely(!is_migrate_isolate(migratetype))) |
| __mod_zone_freepage_state(zone, 1 << order, migratetype); |
| |
| /* The block must be naturally aligned to its order. */ |
| VM_BUG_ON_PAGE(pfn & ((1 << order) - 1), page); |
| VM_BUG_ON_PAGE(bad_range(zone, page), page); |
| |
| while (order < MAX_ORDER - 1) { |
| /* |
| * The page was handed to the capture control instead of a free |
| * list: take back the accounting done above and stop. |
| */ |
| if (compaction_capture(capc, page, order, migratetype)) { |
| __mod_zone_freepage_state(zone, -(1 << order), |
| migratetype); |
| return; |
| } |
| |
| buddy = find_buddy_page_pfn(page, pfn, order, &buddy_pfn); |
| if (!buddy) |
| goto done_merging; |
| |
| if (unlikely(order >= pageblock_order)) { |
| /* |
| * We want to prevent merge between freepages on pageblock |
| * without fallbacks and normal pageblock. Without this, |
| * pageblock isolation could cause incorrect freepage or CMA |
| * accounting or HIGHATOMIC accounting. |
| */ |
| int buddy_mt = get_pageblock_migratetype(buddy); |
| |
| if (migratetype != buddy_mt |
| && (!migratetype_is_mergeable(migratetype) || |
| !migratetype_is_mergeable(buddy_mt))) |
| goto done_merging; |
| } |
| |
| /* |
| * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, |
| * merge with it and move up one order. |
| */ |
| if (page_is_guard(buddy)) |
| clear_page_guard(zone, buddy, order, migratetype); |
| else |
| del_page_from_free_list(buddy, zone, order); |
| /* The merged block starts at the lower of the two pfns. */ |
| combined_pfn = buddy_pfn & pfn; |
| page = page + (combined_pfn - pfn); |
| pfn = combined_pfn; |
| order++; |
| } |
| |
| done_merging: |
| set_buddy_order(page, order); |
| |
| /* Pick head or tail placement on the free list. */ |
| if (fpi_flags & FPI_TO_TAIL) |
| to_tail = true; |
| else if (is_shuffle_order(order)) |
| to_tail = shuffle_pick_tail(); |
| else |
| to_tail = buddy_merge_likely(pfn, buddy_pfn, page, order); |
| |
| if (to_tail) |
| add_to_free_list_tail(page, zone, order, migratetype); |
| else |
| add_to_free_list(page, zone, order, migratetype); |
| |
| /* Notify page reporting subsystem of freed page */ |
| if (!(fpi_flags & FPI_SKIP_REPORT_NOTIFY)) |
| page_reporting_notify_free(order); |
| } |
| |
| /** |
| * split_free_page() -- split a free page at split_pfn_offset |
| * @free_page: the original free page |
| * @order: the order of the page |
| * @split_pfn_offset: split offset within the page |
| * |
| * Return -ENOENT if the free page is changed, otherwise 0 |
| * |
| * It is used when the free page crosses two pageblocks with different migratetypes |
| * at split_pfn_offset within the page. The split free page will be put into |
| * separate migratetype lists afterwards. Otherwise, the function achieves |
| * nothing. |
| * |
| * The function takes zone->lock itself. A @split_pfn_offset of 0 returns 0 |
| * without taking the lock. |
| */ |
| int split_free_page(struct page *free_page, |
| unsigned int order, unsigned long split_pfn_offset) |
| { |
| struct zone *zone = page_zone(free_page); |
| unsigned long free_page_pfn = page_to_pfn(free_page); |
| unsigned long pfn; |
| unsigned long flags; |
| int free_page_order; |
| int mt; |
| int ret = 0; |
| |
| if (split_pfn_offset == 0) |
| return ret; |
| |
| spin_lock_irqsave(&zone->lock, flags); |
| |
| /* The page may have been allocated or merged before we got the lock. */ |
| if (!PageBuddy(free_page) || buddy_order(free_page) != order) { |
| ret = -ENOENT; |
| goto out; |
| } |
| |
| /* |
| * Take the whole block out of the free page accounting; each piece is |
| * accounted again by __free_one_page() below. |
| */ |
| mt = get_pageblock_migratetype(free_page); |
| if (likely(!is_migrate_isolate(mt))) |
| __mod_zone_freepage_state(zone, -(1UL << order), mt); |
| |
| del_page_from_free_list(free_page, zone, order); |
| for (pfn = free_page_pfn; |
| pfn < free_page_pfn + (1UL << order);) { |
| /* migratetype of the pageblock this piece lives in */ |
| int piece_mt = get_pfnblock_migratetype(pfn_to_page(pfn), pfn); |
| |
| /* |
| * Largest order that is both naturally aligned at @pfn and does |
| * not reach past the remaining part being split off. |
| */ |
| free_page_order = min_t(unsigned int, |
| pfn ? __ffs(pfn) : order, |
| __fls(split_pfn_offset)); |
| __free_one_page(pfn_to_page(pfn), pfn, zone, free_page_order, |
| piece_mt, FPI_NONE); |
| pfn += 1UL << free_page_order; |
| split_pfn_offset -= (1UL << free_page_order); |
| /* we have done the first part, now switch to second part */ |
| if (split_pfn_offset == 0) |
| split_pfn_offset = (1UL << order) - (pfn - free_page_pfn); |
| } |
| out: |
| spin_unlock_irqrestore(&zone->lock, flags); |
| return ret; |
| } |
| /* |
| * Cheap sanity check of a page: rather than branching on every field, the |
| * fields that must all be zero are OR-ed together and tested once. Returns |
| * true when the page looks fine. On false the caller has to run a detailed |
| * check itself if it needs to know which field is wrong. |
| * |
| * @check_flags: the page flag bits that must not be set. |
| */ |
| static inline bool page_expected_state(struct page *page, |
| unsigned long check_flags) |
| { |
| unsigned long unexpected; |
| |
| if (unlikely(atomic_read(&page->_mapcount) != -1)) |
| return false; |
| |
| unexpected = (unsigned long)page->mapping | |
| page_ref_count(page) | |
| (page->flags & check_flags); |
| #ifdef CONFIG_MEMCG |
| unexpected |= page->memcg_data; |
| #endif |
| |
| return likely(!unexpected); |
| } |
| |
| /* |
| * Return a string naming what is wrong with @page, or NULL if none of the |
| * checked fields is off. @flags is the set of page flag bits that must be |
| * clear. |
| * |
| * When several fields are bad only one reason is reported. The checks are |
| * ordered by precedence, highest first: memcg charge (CONFIG_MEMCG only), |
| * page flags, refcount, mapping, mapcount. |
| */ |
| static const char *page_bad_reason(struct page *page, unsigned long flags) |
| { |
| #ifdef CONFIG_MEMCG |
| if (unlikely(page->memcg_data)) |
| return "page still charged to cgroup"; |
| #endif |
| if (unlikely(page->flags & flags)) { |
| if (flags == PAGE_FLAGS_CHECK_AT_PREP) |
| return "PAGE_FLAGS_CHECK_AT_PREP flag(s) set"; |
| return "PAGE_FLAGS_CHECK_AT_FREE flag(s) set"; |
| } |
| if (unlikely(page_ref_count(page) != 0)) |
| return "nonzero _refcount"; |
| if (unlikely(page->mapping != NULL)) |
| return "non-NULL mapping"; |
| if (unlikely(atomic_read(&page->_mapcount) != -1)) |
| return "nonzero mapcount"; |
| |
| return NULL; |
| } |
| |
| /* |
| * Report @page through bad_page(), with the reason determined against the |
| * PAGE_FLAGS_CHECK_AT_FREE flag set. |
| */ |
| static void free_page_is_bad_report(struct page *page) |
| { |
| const char *reason = page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE); |
| |
| bad_page(page, reason); |
| } |
| |
| /* |
| * Check a page that is being freed. Returns false if it is in the expected |
| * state; otherwise reports the page and returns true. |
| */ |
| static inline bool free_page_is_bad(struct page *page) |
| { |
| if (unlikely(!page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) { |
| /* Something has gone sideways, find it */ |
| free_page_is_bad_report(page); |
| return true; |
| } |
| |
| return false; |
| } |
| |
| /* |
| * Check one tail page @page of the compound page headed by @head_page while |
| * it is being freed. |
| * |
| * Returns 0 if the tail page is fine, or if CONFIG_DEBUG_VM is disabled (no |
| * checks are done then), and 1 after reporting a problem via bad_page(). |
| * In every case ->mapping is reset to NULL and the compound head link is |
| * cleared before returning. |
| */ |
| static int free_tail_pages_check(struct page *head_page, struct page *page) |
| { |
| int ret = 1; |
| |
| /* |
| * We rely on page->lru.next never having bit 0 set, unless the page |
| * is PageTail(). Let's make sure that's true even for poisoned ->lru. |
| */ |
| BUILD_BUG_ON((unsigned long)LIST_POISON1 & 1); |
| |
| if (!IS_ENABLED(CONFIG_DEBUG_VM)) { |
| ret = 0; |
| goto out; |
| } |
| /* What ->mapping holds depends on the tail page's index. */ |
| switch (page - head_page) { |
| case 1: |
| /* the first tail page: ->mapping may be compound_mapcount() */ |
| if (unlikely(compound_mapcount(page))) { |
| bad_page(page, "nonzero compound_mapcount"); |
| goto out; |
| } |
| break; |
| case 2: |
| /* |
| * the second tail page: ->mapping is |
| * deferred_list.next -- ignore value. |
| */ |
| break; |
| default: |
| if (page->mapping != TAIL_MAPPING) { |
| bad_page(page, "corrupted mapping in tail page"); |
| goto out; |
| } |
| break; |
| } |
| if (unlikely(!PageTail(page))) { |
| bad_page(page, "PageTail not set"); |
| goto out; |
| } |
| if (unlikely(compound_head(page) != head_page)) { |
| bad_page(page, "compound_head not consistent"); |
| goto out; |
| } |
| ret = 0; |
| out: |
| /* Always scrub the tail page state, whether or not a check failed. */ |
| page->mapping = NULL; |
| clear_compound_head(page); |
| return ret; |
| } |
| |
| /* |
| * Tell whether KASAN poisoning must be skipped when freeing @page. It is |
| * skipped in any of these cases: |
| * |
| * 1. Deferred memory initialization is still in progress (explained |
| * below). |
| * 2. The caller passed FPI_SKIP_KASAN_POISON, see the comment next to that |
| * flag. This is honoured only when CONFIG_KASAN_GENERIC is not enabled. |
| * 3. The page is marked PageSkipKASanPoison, which is requested via |
| * __GFP_SKIP_KASAN_POISON; see the comment next to that flag. |
| * |
| * Poisoning pages during deferred memory init would greatly lengthen the |
| * process and cause problems on large memory systems, because deferred page |
| * initialization runs with interrupts disabled. |
| * |
| * Provided nothing references the newly initialized pages before they are |
| * first allocated, skipping has no effect on KASAN memory tracking: the |
| * poison is properly inserted at page allocation time. The one corner case is |
| * a page that is allocated on demand and freed again before deferred |
| * initialization has finished, which is not likely to happen. |
| */ |
| static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) |
| { |
| if (deferred_pages_enabled()) |
| return true; |
| |
| if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && |
| (fpi_flags & FPI_SKIP_KASAN_POISON)) |
| return true; |
| |
| return PageSkipKASanPoison(page); |
| } |
| |
| /* |
| * Clear @numpages consecutive pages starting at @page, using |
| * clear_highpage_kasan_tagged() on each, with KASAN checks disabled for the |
| * current task while doing so. |
| */ |
| static void kernel_init_pages(struct page *page, int numpages) |
| { |
| int idx = 0; |
| |
| /* s390's use of memset() could override KASAN redzones. */ |
| kasan_disable_current(); |
| while (idx < numpages) { |
| clear_highpage_kasan_tagged(&page[idx]); |
| idx++; |
| } |
| kasan_enable_current(); |
| } |
| |
| /* |
| * Prepare the block of (1 << @order) pages at @page for being freed. |
| * |
| * @check_free: also run free_page_is_bad() on the head page. |
| * @fpi_flags: consulted by should_skip_kasan_poison(). |
| * |
| * Returns true when the pages may be freed. Returns false when the caller |
| * must not free them: either an order-0 hwpoisoned page, or at least one |
| * page of the block failed its checks. |
| */ |
| static __always_inline bool free_pages_prepare(struct page *page, |
| unsigned int order, bool check_free, fpi_t fpi_flags) |
| { |
| int bad = 0; |
| bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); |
| bool init = want_init_on_free(); |
| |
| VM_BUG_ON_PAGE(PageTail(page), page); |
| |
| trace_mm_page_free(page, order); |
| kmsan_free_page(page, order); |
| |
| if (unlikely(PageHWPoison(page)) && !order) { |
| /* |
| * Do not let hwpoison pages hit pcplists/buddy |
| * Untie memcg state and reset page's owner |
| */ |
| if (memcg_kmem_enabled() && PageMemcgKmem(page)) |
| __memcg_kmem_uncharge_page(page, order); |
| reset_page_owner(page, order); |
| page_table_check_free(page, order); |
| return false; |
| } |
| |
| /* |
| * Check tail pages before head page information is cleared to |
| * avoid checking PageCompound for order-0 pages. |
| */ |
| if (unlikely(order)) { |
| bool compound = PageCompound(page); |
| int i; |
| |
| VM_BUG_ON_PAGE(compound && compound_order(page) != order, page); |
| |
| if (compound) { |
| ClearPageDoubleMap(page); |
| ClearPageHasHWPoisoned(page); |
| } |
| /* Index 0 is the head page; it is checked further down. */ |
| for (i = 1; i < (1 << order); i++) { |
| if (compound) |
| bad += free_tail_pages_check(page, page + i); |
| if (unlikely(free_page_is_bad(page + i))) { |
| bad++; |
| continue; |
| } |
| (page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
| } |
| } |
| if (PageMappingFlags(page)) |
| page->mapping = NULL; |
| if (memcg_kmem_enabled() && PageMemcgKmem(page)) |
| __memcg_kmem_uncharge_page(page, order); |
| if (check_free && free_page_is_bad(page)) |
| bad++; |
| /* Any bad page in the block keeps the whole block from being freed. */ |
| if (bad) |
| return false; |
| |
| page_cpupid_reset_last(page); |
| page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; |
| reset_page_owner(page, order); |
| page_table_check_free(page, order); |
| |
| if (!PageHighMem(page)) { |
| debug_check_no_locks_freed(page_address(page), |
| PAGE_SIZE << order); |
| debug_check_no_obj_freed(page_address(page), |
| PAGE_SIZE << order); |
| } |
| |
| kernel_poison_pages(page, 1 << order); |
| |
| /* |
| * As memory initialization might be integrated into KASAN, |
| * KASAN poisoning and memory initialization code must be |
| * kept together to avoid discrepancies in behavior. |
| * |
| * With hardware tag-based KASAN, memory tags must be set before the |
| * page becomes unavailable via debug_pagealloc or arch_free_page. |
| */ |
| if (!skip_kasan_poison) { |
| kasan_poison_pages(page, order, init); |
| |
| /* Memory is already initialized if KASAN did it internally. */ |
| if (kasan_has_integrated_init()) |
| init = false; |
| } |
| if (init) |
| kernel_init_pages(page, 1 << order); |
| |
| /* |
| * arch_free_page() can make the page's contents inaccessible. s390 |
| * does this. So nothing which can access the page's contents should |
| * happen after this. |
| */ |
| arch_free_page(page, order); |
| |
| debug_pagealloc_unmap_pages(page, 1 << order); |
| |
| return true; |
| } |
| |
| #ifdef CONFIG_DEBUG_VM |
| /* |
| * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed |
| * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when |
| * moved from pcp lists to free lists. |
| * |
| * Returns the result of free_pages_prepare() with the head page check |
| * enabled and FPI_NONE. |
| */ |
| static bool free_pcp_prepare(struct page *page, unsigned int order) |
| { |
| return free_pages_prepare(page, order, true, FPI_NONE); |
| } |
| |
| /* |
| * Return true if this page has an inappropriate state. In this (DEBUG_VM) |
| * variant the page is rechecked only when debug_pagealloc is enabled; |
| * otherwise false is returned without looking at the page. |
| */ |
| static bool bulkfree_pcp_prepare(struct page *page) |
| { |
| if (!debug_pagealloc_enabled_static()) |
| return false; |
| |
| return free_page_is_bad(page); |
| } |
| #else |
| /* |
| * With DEBUG_VM disabled, the check of order-0 pages being freed is put off |
| * until they move from the pcp lists to the free list, to reduce overhead. |
| * Only with debug_pagealloc enabled are they also checked right away when |
| * being freed to the pcp lists. |
| */ |
| static bool free_pcp_prepare(struct page *page, unsigned int order) |
| { |
| if (!debug_pagealloc_enabled_static()) |
| return free_pages_prepare(page, order, false, FPI_NONE); |
| |
| return free_pages_prepare(page, order, true, FPI_NONE); |
| } |
| |
| /* |
| * !DEBUG_VM variant: always check the page when it leaves the pcp list. |
| * Returns true if the page is in an inappropriate state. |
| */ |
| static bool bulkfree_pcp_prepare(struct page *page) |
| { |
| return free_page_is_bad(page); |
| } |
| #endif /* CONFIG_DEBUG_VM */ |
| |
| /* |
| * Frees a number of pages from the PCP lists |
| * Assumes all pages on list are in same zone. |
| * count is the number of pages to free. |
| * |
| * @pindex is the pcp list index that is drained first; after that the lists |
| * are visited round-robin. zone->lock is taken here for the whole drain. |
| */ |
| static void free_pcppages_bulk(struct zone *zone, int count, |
| struct per_cpu_pages *pcp, |
| int pindex) |
| { |
| unsigned long flags; |
| int min_pindex = 0; |
| int max_pindex = NR_PCP_LISTS - 1; |
| unsigned int order; |
| bool isolated_pageblocks; |
| struct page *page; |
| |
| /* |
| * Ensure a proper count is passed, otherwise we would get stuck in |
| * the list-search loop below once every list is empty. |
| */ |
| count = min(pcp->count, count); |
| |
| /* Ensure requested pindex is drained first. */ |
| pindex = pindex - 1; |
| |
| spin_lock_irqsave(&zone->lock, flags); |
| isolated_pageblocks = has_isolate_pageblock(zone); |
| |
| while (count > 0) { |
| struct list_head *list; |
| int nr_pages; |
| |
| /* Remove pages from lists in a round-robin fashion. */ |
| do { |
| if (++pindex > max_pindex) |
| pindex = min_pindex; |
| list = &pcp->lists[pindex]; |
| if (!list_empty(list)) |
| break; |
| |
| /* Shrink the search window past empty lists at its edges. */ |
| if (pindex == max_pindex) |
| max_pindex--; |
| if (pindex == min_pindex) |
| min_pindex++; |
| } while (1); |
| |
| order = pindex_to_order(pindex); |
| nr_pages = 1 << order; |
| do { |
| int mt; |
| |
| page = list_last_entry(list, struct page, pcp_list); |
| mt = get_pcppage_migratetype(page); |
| |
| /* must delete to avoid corrupting pcp list */ |
| list_del(&page->pcp_list); |
| count -= nr_pages; |
| pcp->count -= nr_pages; |
| |
| /* |
| * A bad page is already unlinked and uncounted above; it is |
| * simply not handed to __free_one_page(). |
| */ |
| if (bulkfree_pcp_prepare(page)) |
| continue; |
| |
| /* MIGRATE_ISOLATE page should not go to pcplists */ |
| VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); |
| /* Pageblock could have been isolated meanwhile */ |
| if (unlikely(isolated_pageblocks)) |
| mt = get_pageblock_migratetype(page); |
| |
| __free_one_page(page, page_to_pfn(page), zone, order, mt, FPI_NONE); |
| trace_mm_page_pcpu_drain(page, order, mt); |
| } while (count > 0 && !list_empty(list)); |
| } |
| |
| spin_unlock_irqrestore(&zone->lock, flags); |
| } |
| |
| /* |
| * Free the block of (1 << @order) pages at @page/@pfn into @zone under |
| * zone->lock. If the zone has isolated pageblocks, or @migratetype itself |
| * is an isolate type, the migratetype is looked up again from the pageblock |
| * once the lock is held. |
| */ |
| static void free_one_page(struct zone *zone, |
| struct page *page, unsigned long pfn, |
| unsigned int order, |
| int migratetype, fpi_t fpi_flags) |
| { |
| unsigned long irq_flags; |
| |
| spin_lock_irqsave(&zone->lock, irq_flags); |
| |
| if (unlikely(has_isolate_pageblock(zone) || |
| is_migrate_isolate(migratetype))) |
| migratetype = get_pfnblock_migratetype(page, pfn); |
| |
| __free_one_page(page, pfn, zone, order, migratetype, fpi_flags); |
| |
| spin_unlock_irqrestore(&zone->lock, irq_flags); |
| } |
| |
| /* |
| * Initialise the struct page @page describing @pfn: zero it, record its |
| * zone index @zone and node @nid, and reset refcount, mapcount, cpupid, |
| * KASAN tag and the lru list head. |
| */ |
| static void __meminit __init_single_page(struct page *page, unsigned long pfn, |
| unsigned long zone, int nid) |
| { |
| mm_zero_struct_page(page); |
| set_page_links(page, zone, nid, pfn); |
| init_page_count(page); |
| page_mapcount_reset(page); |
| page_cpupid_reset_last(page); |
| page_kasan_tag_reset(page); |
| |
| INIT_LIST_HEAD(&page->lru); |
| #ifdef WANT_PAGE_VIRTUAL |
| /* The shift won't overflow because ZONE_NORMAL is below 4G. */ |
| if (!is_highmem_idx(zone)) |
| set_page_address(page, __va(pfn << PAGE_SHIFT)); |
| #endif |
| } |
| |
| #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
| /* |
| * Initialise the struct page of @pfn if early_page_uninitialised() reports |
| * it as not initialised yet. The zone index passed on is that of the first |
| * zone of the pfn's node that spans @pfn. |
| */ |
| static void __meminit init_reserved_page(unsigned long pfn) |
| { |
| pg_data_t *pgdat; |
| int nid; |
| int zid = 0; |
| |
| if (!early_page_uninitialised(pfn)) |
| return; |
| |
| nid = early_pfn_to_nid(pfn); |
| pgdat = NODE_DATA(nid); |
| |
| /* zid is left at MAX_NR_ZONES when no zone of the node spans @pfn */ |
| while (zid < MAX_NR_ZONES && |
| !zone_spans_pfn(&pgdat->node_zones[zid], pfn)) |
| zid++; |
| |
| __init_single_page(pfn_to_page(pfn), pfn, zid, nid); |
| } |
| #else |
| /* !CONFIG_DEFERRED_STRUCT_PAGE_INIT stub: nothing to do. */ |
| static inline void init_reserved_page(unsigned long pfn) |
| { |
| } |
| #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ |
| |
| /* |
| * Initialised pages do not have PageReserved set. This function is called |
| * for each range allocated by the bootmem allocator and marks its pages |
| * PageReserved. The remaining valid pages are later sent to the buddy page |
| * allocator. |
| * |
| * The physical range [@start, @end) is widened to whole pages (@start is |
| * rounded down, @end rounded up); pfns that are not valid are skipped. |
| */ |
| void __meminit reserve_bootmem_region(phys_addr_t start, phys_addr_t end) |
| { |
| unsigned long last_pfn = PFN_UP(end); |
| unsigned long pfn; |
| |
| for (pfn = PFN_DOWN(start); pfn < last_pfn; pfn++) { |
| struct page *page; |
| |
| if (!pfn_valid(pfn)) |
| continue; |
| |
| page = pfn_to_page(pfn); |
| init_reserved_page(pfn); |
| |
| /* Avoid false-positive PageTail() */ |
| INIT_LIST_HEAD(&page->lru); |
| |
| /* |
| * The non-atomic flag setter is enough: the struct page is not |
| * visible yet, so nobody should be accessing it. |
| */ |
| __SetPageReserved(page); |
| } |
| } |
| |
| /* |
| * Free the block of (1 << @order) pages at @page straight to the buddy free |
| * lists. |
| * |
| * The pages are first run through free_pages_prepare() with the head page |
| * check enabled; if that fails nothing is freed. @fpi_flags is passed to |
| * both the prepare step and the actual free. PGFREE is bumped by the number |
| * of pages freed. |
| */ |
| static void __free_pages_ok(struct page *page, unsigned int order, |
| fpi_t fpi_flags) |
| { |
| int migratetype; |
| unsigned long pfn = page_to_pfn(page); |
| struct zone *zone = page_zone(page); |
| |
| if (!free_pages_prepare(page, order, true, fpi_flags)) |
| return; |
| |
| migratetype = get_pfnblock_migratetype(page, pfn); |
| |
| /* |
| * free_one_page() takes zone->lock, re-reads the migratetype under the |
| * lock when isolation is involved and calls __free_one_page(), which is |
| * the sequence this function used to open-code. |
| */ |
| free_one_page(zone, page, pfn, order, migratetype, fpi_flags); |
| |
| __count_vm_events(PGFREE, 1 << order); |
| } |
| |
| /* |
| * Hand the block of (1 << @order) freshly initialised pages at @page over to |
| * the buddy allocator. |
| * |
| * Every page of the block gets PageReserved cleared and its refcount set to |
| * 0, and the zone's managed_pages count grows by the block size. If the |
| * block contains unaccepted memory, a block of the largest buddy order may |
| * be handed to __free_unaccepted() instead; otherwise the memory is |
| * accepted here before the pages are freed. |
| */ |
| void __free_pages_core(struct page *page, unsigned int order) |
| { |
| unsigned int nr_pages = 1 << order; |
| struct page *p = page; |
| unsigned int loop; |
| |
| /* |
| * When initializing the memmap, __init_single_page() sets the refcount |
| * of all pages to 1 ("allocated"/"not free"). We have to set the |
| * refcount of all involved pages to 0. |
| */ |
| prefetchw(p); |
| for (loop = 0; loop < (nr_pages - 1); loop++, p++) { |
| prefetchw(p + 1); |
| __ClearPageReserved(p); |
| set_page_count(p, 0); |
| } |
| /* last page of the block: nothing further to prefetch */ |
| __ClearPageReserved(p); |
| set_page_count(p, 0); |
| |
| atomic_long_add(nr_pages, &page_zone(page)->managed_pages); |
| |
| if (page_contains_unaccepted(page, order)) { |
| /* |
| * The largest order the buddy allocator in this file works with |
| * is MAX_ORDER - 1 (see the merge loop bound in |
| * __free_one_page()), so a test against MAX_ORDER could never |
| * match and __free_unaccepted() would never be reached. |
| */ |
| if (order == MAX_ORDER - 1 && __free_unaccepted(page)) |
| return; |
| |
| accept_page(page, order); |
| } |
| |
| /* |
| * Bypass PCP and place fresh pages right to the tail, primarily |
| * relevant for memory onlining. |
| */ |
| __free_pages_ok(page, order, FPI_TO_TAIL | FPI_SKIP_KASAN_POISON); |
| } |
| |
| #ifdef CONFIG_NUMA |
| |
| /* |
| * During memory init memblocks map pfns to nids. The search is expensive and |
| * this caches recent lookups. The implementation of __early_pfn_to_nid |
| * treats start/end as pfns. |
| */ |
| struct mminit_pfnnid_cache { |
| /* first pfn of the cached range (inclusive) */ |
| unsigned long last_start; |
| /* end pfn of the cached range (exclusive) */ |
| unsigned long last_end; |
| /* node id returned for any pfn inside the cached range */ |
| int last_nid; |
| }; |
| |
| /* The cache instance used by early_pfn_to_nid(). */ |
| static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata; |
| |
| /* |
| * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. |
| * |
| * A pfn inside the range remembered in @state is answered from the cache. |
| * Otherwise memblock is searched and, if a node was found, the range it |
| * reported replaces the cached one. Returns NUMA_NO_NODE when memblock does |
| * not know the pfn. |
| */ |
| static int __meminit __early_pfn_to_nid(unsigned long pfn, |
| struct mminit_pfnnid_cache *state) |
| { |
| unsigned long range_start, range_end; |
| int nid; |
| |
| if (pfn >= state->last_start && pfn < state->last_end) |
| return state->last_nid; |
| |
| nid = memblock_search_pfn_nid(pfn, &range_start, &range_end); |
| if (nid == NUMA_NO_NODE) |
| return nid; |
| |
| state->last_start = range_start; |
| state->last_end = range_end; |
| state->last_nid = nid; |
| |
| return nid; |
| } |
| |
| /* |
| * Return the node id of @pfn, using the shared lookup cache. A negative |
| * lookup result is replaced by first_online_node, so the value returned is |
| * never negative because of a failed lookup. |
| */ |
| int __meminit early_pfn_to_nid(unsigned long pfn) |
| { |
| /* serialises access to early_pfnnid_cache */ |
| static DEFINE_SPINLOCK(early_pfn_lock); |
| int nid; |
| |
| spin_lock(&early_pfn_lock); |
| nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache); |
| if (nid < 0) |
| nid = first_online_node; |
| spin_unlock(&early_pfn_lock); |
| |
| return nid; |
| } |
| #endif /* CONFIG_NUMA */ |
| |
| /* |
| * Release the block of (1 << @order) pages at @page/@pfn from memblock to |
| * the page allocator. Nothing is done when the struct page of @pfn is not |
| * initialised yet, or when kmsan_memblock_free_pages() returns false, in |
| * which case KMSAN will take care of these pages. |
| */ |
| void __init memblock_free_pages(struct page *page, unsigned long pfn, |
| unsigned int order) |
| { |
| if (early_page_uninitialised(pfn) || |
| !kmsan_memblock_free_pages(page, order)) |
| return; |
| |
| __free_pages_core(page, order); |
| } |
| |
| /* |
| * Before the migration or free compaction scanner walks a pageblock (or part |
| * of one) given by the interval [start_pfn, end_pfn), verify that the |
| * interval is valid and lies within @zone. |
| * |
| * Returns the struct page of start_pfn, or NULL if a check failed. |
| * |
| * Some configurations have layouts such as node0 node1 node0, i.e. not every |
| * page inside a zone's pfn range has to belong to that zone. We assume that |
| * a node0/node1 border may fall inside a single pageblock, but that a |
| * node0 node1 node0 interleaving inside one pageblock does not occur. That |
| * makes it sufficient to look at the first and the last page of the range |
| * rather than at every page in it. |
| */ |
| struct page *__pageblock_pfn_to_page(unsigned long start_pfn, |
| unsigned long end_pfn, struct zone *zone) |
| { |
| /* end_pfn is one past the range we are checking */ |
| unsigned long last_pfn = end_pfn - 1; |
| struct page *first; |
| struct page *last; |
| |
| if (!pfn_valid(start_pfn) || !pfn_valid(last_pfn)) |
| return NULL; |
| |
| first = pfn_to_online_page(start_pfn); |
| if (!first || page_zone(first) != zone) |
| return NULL; |
| |
| last = pfn_to_page(last_pfn); |
| |
| /* Comparing zone ids gives shorter code than deriving page_zone(last) */ |
| if (page_zone_id(first) != page_zone_id(last)) |
| return NULL; |
| |
| return first; |
| } |
| |
| /* |
| * Walk @zone one pageblock at a time and set zone->contiguous if every |
| * block passes __pageblock_pfn_to_page(). On the first block that fails the |
| * function returns without touching the flag. |
| */ |
| void set_zone_contiguous(struct zone *zone) |
| { |
| unsigned long start = zone->zone_start_pfn; |
| unsigned long end = pageblock_end_pfn(start); |
| |
| while (start < zone_end_pfn(zone)) { |
| /* the last block may be cut short by the end of the zone */ |
| end = min(end, zone_end_pfn(zone)); |
| |
| if (!__pageblock_pfn_to_page(start, end, zone)) |
| return; |
| cond_resched(); |
| |
| start = end; |
| end += pageblock_nr_pages; |
| } |
| |
| /* We confirm that there is no hole */ |
| zone->contiguous = true; |
| } |
| |
| /* Reset the contiguous flag of @zone. */ |
| void clear_zone_contiguous(struct zone *zone) |
| { |
| zone->contiguous = false; |
| } |
| |
| #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
| /* |
| * Free the @nr_pages pages starting at @pfn to the buddy allocator. |
| * |
| * A range that is exactly one naturally aligned pageblock is freed as a |
| * single pageblock_order block. Any other range is accepted up front and |
| * then freed page by page. Each pageblock whose first pfn lies in the range |
| * is set to MIGRATE_MOVABLE. |
| */ |
| static void __init deferred_free_range(unsigned long pfn, |
| unsigned long nr_pages) |
| { |
| struct page *page; |
| unsigned long off; |
| |
| if (!nr_pages) |
| return; |
| |
| page = pfn_to_page(pfn); |
| |
| /* Free a large naturally-aligned chunk if possible */ |
| if (pageblock_aligned(pfn) && nr_pages == pageblock_nr_pages) { |
| set_pageblock_migratetype(page, MIGRATE_MOVABLE); |
| __free_pages_core(page, pageblock_order); |
| return; |
| } |
| |
| /* Smaller chunks are accepted as a whole before being freed */ |
| accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages)); |
| |
| for (off = 0; off < nr_pages; off++) { |
| if (pageblock_aligned(pfn + off)) |
| set_pageblock_migratetype(page + off, MIGRATE_MOVABLE); |
| __free_pages_core(page + off, 0); |
| } |
| } |
| |
| /* Completion tracking for deferred_init_memmap() threads */ |
| static atomic_t pgdat_init_n_undone __initdata; |
| static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp); |
| |
| /* |
| * Decrement pgdat_init_n_undone; the caller that brings it to zero |
| * completes pgdat_init_all_done_comp. |
| */ |
| static inline void __init pgdat_init_report_one_done(void) |
| { |
| if (atomic_dec_and_test(&pgdat_init_n_undone)) |
| complete(&pgdat_init_all_done_comp); |
| } |
| |
| /* |
| * Returns true if page needs to be initialized or freed to buddy allocator. |
| * |
| * Validity is tested only on the head pfn of a pageblock: a pfn that is not |
| * pageblock aligned is always reported as valid, an aligned one is valid |
| * exactly when pfn_valid() says so. |
| */ |
| static inline bool __init deferred_pfn_valid(unsigned long pfn) |
| { |
| if (!pageblock_aligned(pfn)) |
| return true; |
| |
| return pfn_valid(pfn); |
| } |
| |
| /* |
| * Free the pages in [@pfn, @end_pfn) to the buddy allocator, trying to free |
| * aligned runs of pageblock_nr_pages pages at once. |
| * |
| * Pages are gathered into a run. A run ends at every pageblock boundary and |
| * at every pfn that deferred_pfn_valid() rejects; the finished run is passed |
| * to deferred_free_range(). |
| */ |
| static void __init deferred_free_pages(unsigned long pfn, |
| unsigned long end_pfn) |
| { |
| unsigned long nr_free = 0; |
| |
| for (; pfn < end_pfn; pfn++) { |
| bool valid = deferred_pfn_valid(pfn); |
| |
| /* valid pfn inside a pageblock: the current run just grows */ |
| if (valid && !pageblock_aligned(pfn)) { |
| nr_free++; |
| continue; |
| } |
| |
| /* |
| * Flush the run collected so far. A valid pfn (which here is |
| * pageblock aligned) opens the next run, an invalid one does not. |
| */ |
| deferred_free_range(pfn - nr_free, nr_free); |
| nr_free = valid ? 1 : 0; |
| } |
| /* Free the last block of pages to allocator */ |
| deferred_free_range(pfn - nr_free, nr_free); |
| } |
| |
| /* |
| * Initialize the struct pages of [@pfn, @end_pfn) in @zone. The pfn to page |
| * lookup is done only at pageblock boundaries and after a skipped pfn; in |
| * between the page pointer is simply advanced. |
| * Return number of pages initialized. |
| */ |
| static unsigned long __init deferred_init_pages(struct zone *zone, |
| unsigned long pfn, |
| unsigned long end_pfn) |
| { |
| struct page *page = NULL; |
| unsigned long nr_pages = 0; |
| int zid = zone_idx(zone); |
| int nid = zone_to_nid(zone); |
| |
| for (; pfn < end_pfn; pfn++) { |
| if (!deferred_pfn_valid(pfn)) { |
| /* force a fresh lookup for the next valid pfn */ |
| page = NULL; |
| continue; |
| } |
| |
| if (page && !pageblock_aligned(pfn)) |
| page++; |
| else |
| page = pfn_to_page(pfn); |
| |
| __init_single_page(page, pfn, zid, nid); |
| nr_pages++; |
| } |
| |
| return nr_pages; |
| } |
| |
| /* |
| * Pre-load the iterator for the zone init: walk the free memory ranges of |
| * @zone until one reaches past @first_init_pfn and stop there. |
| * |
| * On success *@spfn/*@epfn describe that range, with *@spfn raised to |
| * @first_init_pfn if the range starts below it, *@i holds the iterator |
| * position, and true is returned. If no such range exists, false is |
| * returned, indicating there are no valid ranges left. |
| */ |
| static bool __init |
| deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone, |
| unsigned long *spfn, unsigned long *epfn, |
| unsigned long first_init_pfn) |
| { |
| u64 j; |
| |
| /* |
| * Ranges that end at or below first_init_pfn have already been |
| * initialized. Nothing needs to be done with them, so they are just |
| * stepped over. |
| */ |
| for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) { |
| if (*epfn > first_init_pfn) { |
| if (*spfn < first_init_pfn) |
| *spfn = first_init_pfn; |
| *i = j; |
| return true; |
| } |
| } |
| |
| return false; |
| } |
| |
| /* |
| * Initialize and free pages. We do it in two loops: first we initialize |
| * struct page, then free to buddy allocator, because while we are |
| * freeing pages we can access pages that are ahead (computing buddy |
| * page in __free_one_page()). |
| * |
| * In order to try and keep some memory in the cache we have the loop |
| * broken along max page order boundaries. This way we will not cause |
| * any issues with the buddy page computation. |
| * |
| * @i, @start_pfn and @end_pfn are in/out: they give the iterator position |
| * and current range on entry and are advanced by the first loop. |
| * Returns the number of pages initialized. |
| */ |
| static unsigned long __init |
| deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn, |
| unsigned long *end_pfn) |
| { |
| /* first MAX_ORDER_NR_PAGES boundary strictly above *start_pfn */ |
| unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES); |
| /* private copy of the starting range for the second (freeing) pass */ |
| unsigned long spfn = *start_pfn, epfn = *end_pfn; |
| unsigned long nr_pages = 0; |
| u64 j = *i; |
| |
| /* First we loop through and initialize the page values */ |
| for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) { |
| unsigned long t; |
| |
| if (mo_pfn <= *start_pfn) |
| break; |
| |
| t = min(mo_pfn, *end_pfn); |
| nr_pages += deferred_init_pages(zone, *start_pfn, t); |
| |
| /* range crosses the boundary: resume from the boundary next time */ |
| if (mo_pfn < *end_pfn) { |
| *start_pfn = mo_pfn; |
| break; |
| } |
| } |
| |
| /* Reset values and now loop through freeing pages as needed */ |
| swap(j, *i); |
| |
| for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) { |
| unsigned long t; |
| |
| if (mo_pfn <= spfn) |
| break; |
| |
| t = min(mo_pfn, epfn); |
| deferred_free_pages(spfn, t); |
| |
| if (mo_pfn <= epfn) |
| break; |
| } |
| |
| return nr_pages; |
| } |
| |
| /* |
| * Initialize and free the deferred pages of the zone passed in @arg, from |
| * @start_pfn up to @end_pfn. |
| * |
| * NOTE(review): the return value of deferred_init_mem_pfn_range_in_zone() |
| * is not checked here; confirm that callers only pass ranges for which it |
| * succeeds, since spfn, epfn and i have no other initialisation. |
| */ |
| static void __init |
| deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn, |
| void *arg) |
| { |
| unsigned long spfn, epfn; |
| struct zone *zone = arg; |
| u64 i; |
| |
| deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn); |
| |
| /* |
| * Initialize and free pages in MAX_ORDER sized increments so that we |
| * can avoid introducing any issues with the buddy allocator. |
| */ |
| while (spfn < end_pfn) { |
| deferred_init_maxorder(&i, zone, &spfn, &epfn); |
| cond_resched(); |
| } |
| } |
| |
| /* An arch may override for more concurrency. */ |
| __weak int __init |
| deferred_page_init_max_threads(const struct cpumask *node_cpumask) |
| { |
| return 1; |
| } |
| |
| /* Initialise remaining memory on a node */ |
| static int __init deferred_init_memmap(void *data) |
| { |
| pg_data_t *pgdat = data; |
| const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); |
| unsigned long spfn = 0, epfn = 0; |
| unsigned long first_init_pfn, flags; |
| unsigned long start = jiffies; |
| struct zone *zone; |
| int zid, max_threads; |
| u64 i; |
| |
| /* Bind memory initialisation thread to a local node if possible */ |
| if (!cpumask_empty(cpumask)) |
| set_cpus_allowed_ptr(current, cpumask); |
| |
| pgdat_resize_lock(pgdat, &flags); |
| first_init_pfn = pgdat->first_deferred_pfn; |
| if (first_init_pfn == ULONG_MAX) { |
| pgdat_resize_unlock(pgdat, &flags); |
| pgdat_init_report_one_done(); |
| return 0; |
| } |
| |
| /* Sanity check boundaries */ |
| BUG_ON(pgdat->first_deferred_pfn < pgdat->node_start_pfn); |
| BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat)); |
| pgdat->first_deferred_pfn = ULONG_MAX; |
| |
| /* |
| * Once we unlock here, the zone cannot be grown anymore, thus if an |
| * interrupt thread must allocate this early in boot, zone must be |
| * pre-grown prior to start of deferred page initialization. |
| */ |
| pgdat_resize_unlock(pgdat, &flags); |
| |
| /* Only the highest zone is deferred so find it */ |
| for (zid = 0; zid < MAX_NR_ZONES; zid++) { |
| zone = pgdat->node_zones + zid; |
| if (first_init_pfn < zone_end_pfn(zone)) |
| break; |
| } |
| |
| /* If the zone is empty somebody else may have cleared out the zone */ |
| if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, |
| first_init_pfn)) |
| goto zone_empty; |
| |
| max_threads = deferred_page_init_max_threads(cpumask); |
| |
| while (spfn < epfn) { |
| unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION); |
| struct padata_mt_job job = { |
| .thread_fn = deferred_init_memmap_chunk, |
| .fn_arg = zone, |
| .start = spfn, |
| .size = epfn_align - spfn, |
| .align = PAGES_PER_SECTION, |
| .min_chunk = PAGES_PER_SECTION, |
| .max_threads = max_threads, |
| }; |
| |
| padata_do_multithreaded(&job); |
| deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, |
| epfn_align); |
| } |
| zone_empty: |
| /* Sanity check that the next zone really is unpopulated */ |
| WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone)); |
| |
| pr_info("node %d deferred pages initialised in %ums\n", |
| pgdat->node_id, jiffies_to_msecs(jiffies - start)); |
| |
| pgdat_init_report_one_done(); |
| return 0; |
| } |
| |
| /* |
| * If this zone has deferred pages, try to grow it by initializing enough |
| * deferred pages to satisfy the allocation specified by order, rounded up to |
| * the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments |
| * of SECTION_SIZE bytes by initializing struct pages in increments of |
| * PAGES_PER_SECTION * sizeof(struct page) bytes. |
| * |
| * Return true when zone was grown, otherwise return false. We return true even |
| * when we grow less than requested, to let the caller decide if there are |
| * enough pages to satisfy the allocation. |
| * |
| * Note: We use noinline because this function is needed only during boot, and |
| * it is called from a __ref function _deferred_grow_zone. This way we are |
| * making sure that it is not inlined into permanent text section. |
| */ |
| static noinline bool __init |
| deferred_grow_zone(struct zone *zone, unsigned int order) |
| { |
| unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION); |
| pg_data_t *pgdat = zone->zone_pgdat; |
| unsigned long first_deferred_pfn = pgdat->first_deferred_pfn; |
| unsigned long spfn, epfn, flags; |
| unsigned long nr_pages = 0; |
| u64 i; |
| |
| /* Only the last zone may have deferred pages */ |
| if (zone_end_pfn(zone) != pgdat_end_pfn(pgdat)) |
| return false; |
| |
| pgdat_resize_lock(pgdat, &flags); |
| |
| /* |
| * If someone grew this zone while we were waiting for spinlock, return |
| * true, as there might be enough pages already. |
| */ |
| if (first_deferred_pfn != pgdat->first_deferred_pfn) { |
| pgdat_resize_unlock(pgdat, &flags); |
| return true; |
| } |
| |
| /* If the zone is empty somebody else may have cleared out the zone */ |
| if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, |
| first_deferred_pfn)) { |
| pgdat->first_deferred_pfn = ULONG_MAX; |
| pgdat_resize_unlock(pgdat, &flags); |
| /* Retry only once. */ |
| return first_deferred_pfn != ULONG_MAX; |
| } |
| |
| /* |
| * Initialize and free pages in MAX_ORDER sized increments so |
| * that we can avoid introducing any issues with the buddy |
| * allocator. |
| */ |
| while (spfn < epfn) { |
| /* update our first deferred PFN for this section */ |
| first_deferred_pfn = spfn; |
| |
| nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn); |
| touch_nmi_watchdog(); |
| |
| /* We should only stop along section boundaries */ |
| if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION) |
| continue; |
| |
| /* If our quota has been met we can stop here */ |
| if (nr_pages >= nr_pages_needed) |
| break; |
| } |
| |
| pgdat->first_deferred_pfn = spfn; |
| pgdat_resize_unlock(pgdat, &flags); |
| |
| return nr_pages > 0; |
| } |
| |
| /* |
| * deferred_grow_zone() is __init, but it is called from |
| * get_page_from_freelist() during early boot until deferred_pages permanently |
| * disables this call. This is why we have refdata wrapper to avoid warning, |
| * and to ensure that the function body gets unloaded. |
| */ |
| static bool __ref |
| _deferred_grow_zone(struct zone *zone, unsigned int order) |
| { |
| return deferred_grow_zone(zone, order); |
| } |
| |
| #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ |
| |
| void __init page_alloc_init_late(void) |
| { |
| struct zone *zone; |
| int nid; |
| |
| #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT |
| |
| /* There will be num_node_state(N_MEMORY) threads */ |
| atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY)); |
| for_each_node_state(nid, N_MEMORY) { |
| kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid); |
| } |
| |
| /* Block until all are initialised */ |
| wait_for_completion(&pgdat_init_all_done_comp); |
| |
| /* |
| * We initialized the rest of the deferred pages. Permanently disable |
| * on-demand struct page initialization. |
| */ |
| static_branch_disable(&deferred_pages); |
| |
| /* Reinit limits that are based on free pages after the kernel is up */ |
| files_maxfiles_init(); |
| #endif |
| |
| buffer_init(); |
| |
| /* Discard memblock private memory */ |
| memblock_discard(); |
| |
| for_each_node_state(nid, N_MEMORY) |
| shuffle_free_memory(NODE_DATA(nid)); |
| |
| for_each_populated_zone(zone) |
| set_zone_contiguous(zone); |
| } |
| |
| #ifdef CONFIG_CMA |
| /* Free whole pageblock and set its migration type to MIGRATE_CMA. */ |
| void __init init_cma_reserved_pageblock(struct page *page) |
| { |
| unsigned i = pageblock_nr_pages; |
| struct page *p = page; |
| |
| do { |
| __ClearPageReserved(p); |
| set_page_count(p, 0); |
| } while (++p, --i); |
| |
| set_pageblock_migratetype(page, MIGRATE_CMA); |
| set_page_refcounted(page); |
| __free_pages(page, pageblock_order); |
| |
| adjust_managed_page_count(page, pageblock_nr_pages); |
| page_zone(page)->cma_pages += pageblock_nr_pages; |
| } |
| #endif |
| |
| /* |
| * The order of subdivision here is critical for the IO subsystem. |
| * Please do not alter this order without good reasons and regression |
| * testing. Specifically, as large blocks of memory are subdivided, |
| * the order in which smaller blocks are delivered depends on the order |
| * they're subdivided in this function. This is the primary factor |
| * influencing the order in which pages are delivered to the IO |
| * subsystem according to empirical testing, and this is also justified |
| * by considering the behavior of a buddy system containing a single |
| * large block of memory acted on by a series of small allocations. |
| * This behavior is a critical factor in sglist merging's success. |
| * |
| * -- nyc |
| */ |
| static inline void expand(struct zone *zone, struct page *page, |
| int low, int high, int migratetype) |
| { |
| unsigned long size = 1 << high; |
| |
| while (high > low) { |
| high--; |
| size >>= 1; |
| VM_BUG_ON_PAGE(bad_range(zone, &page[size]), &page[size]); |
| |
| /* |
| * Mark as guard pages (or page), that will allow to |
| * merge back to allocator when buddy will be freed. |
| * Corresponding page table entries will not be touched, |
| * pages will stay not present in virtual address space |
| */ |
| if (set_page_guard(zone, &page[size], high, migratetype)) |
| continue; |
| |
| add_to_free_list(&page[size], zone, high, migratetype); |
| set_buddy_order(&page[size], high); |
| } |
| } |
| |
| static void check_new_page_bad(struct page *page) |
| { |
| if (unlikely(page->flags & __PG_HWPOISON)) { |
| /* Don't complain about hwpoisoned pages */ |
| page_mapcount_reset(page); /* remove PageBuddy */ |
| return; |
| } |
| |
| bad_page(page, |
| page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP)); |
| } |
| |
| /* |
| * This page is about to be returned from the page allocator |
| */ |
| static inline int check_new_page(struct page *page) |
| { |
| if (likely(page_expected_state(page, |
| PAGE_FLAGS_CHECK_AT_PREP|__PG_HWPOISON))) |
| return 0; |
| |
| check_new_page_bad(page); |
| return 1; |
| } |
| |
| static bool check_new_pages(struct page *page, unsigned int order) |
| { |
| int i; |
| for (i = 0; i < (1 << order); i++) { |
| struct page *p = page + i; |
| |
| if (unlikely(check_new_page(p))) |
| return true; |
| } |
| |
| return false; |
| } |
| |
| #ifdef CONFIG_DEBUG_VM |
| /* |
| * With DEBUG_VM enabled, order-0 pages are checked for expected state when |
| * being allocated from pcp lists. With debug_pagealloc also enabled, they are |
| * also checked when pcp lists are refilled from the free lists. |
| */ |
| static inline bool check_pcp_refill(struct page *page, unsigned int order) |
| { |
| if (debug_pagealloc_enabled_static()) |
| return check_new_pages(page, order); |
| else |
| return false; |
| } |
| |
| static inline bool check_new_pcp(struct page *page, unsigned int order) |
| { |
| return check_new_pages(page, order); |
| } |
| #else |
| /* |
| * With DEBUG_VM disabled, free order-0 pages are checked for expected state |
| * when pcp lists are being refilled from the free lists. With debug_pagealloc |
| * enabled, they are also checked when being allocated from the pcp lists. |
| */ |
| static inline bool check_pcp_refill(struct page *page, unsigned int order) |
| { |
| return check_new_pages(page, order); |
| } |
| static inline bool check_new_pcp(struct page *page, unsigned int order) |
| { |
| if (debug_pagealloc_enabled_static()) |
| return check_new_pages(page, order); |
| else |
| return false; |
| } |
| #endif /* CONFIG_DEBUG_VM */ |
| |
| static inline bool should_skip_kasan_unpoison(gfp_t flags) |
| { |
| /* Don't skip if a software KASAN mode is enabled. */ |
| if (IS_ENABLED(CONFIG_KASAN_GENERIC) || |
| IS_ENABLED(CONFIG_KASAN_SW_TAGS)) |
| return false; |
| |
| /* Skip, if hardware tag-based KASAN is not enabled. */ |
| if (!kasan_hw_tags_enabled()) |
| return true; |
| |
| /* |
| * With hardware tag-based KASAN enabled, skip if this has been |
| * requested via __GFP_SKIP_KASAN_UNPOISON. |
| */ |
| return flags & __GFP_SKIP_KASAN_UNPOISON; |
| } |
| |
| static inline bool should_skip_init(gfp_t flags) |
| { |
| /* Don't skip, if hardware tag-based KASAN is not enabled. */ |
| if (!kasan_hw_tags_enabled()) |
| return false; |
| |
| /* For hardware tag-based KASAN, skip if requested. */ |
| return (flags & __GFP_SKIP_ZERO); |
| } |
| |
| inline void post_alloc_hook(struct page *page, unsigned int order, |
| gfp_t gfp_flags) |
| { |
| bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags) && |
| !should_skip_init(gfp_flags); |
| bool init_tags = init && (gfp_flags & __GFP_ZEROTAGS); |
| int i; |
| |
| set_page_private(page, 0); |
| set_page_refcounted(page); |
| |
| arch_alloc_page(page, order); |
| debug_pagealloc_map_pages(page, 1 << order); |
| |
| /* |
| * Page unpoisoning must happen before memory initialization. |
| * Otherwise, the poison pattern will be overwritten for __GFP_ZERO |
| * allocations and the page unpoisoning code will complain. |
| */ |
| kernel_unpoison_pages(page, 1 << order); |
| |
| /* |
| * As memory initialization might be integrated into KASAN, |
| * KASAN unpoisoning and memory initializion code must be |
| * kept together to avoid discrepancies in behavior. |
| */ |
| |
| /* |
| * If memory tags should be zeroed (which happens only when memory |
| * should be initialized as well). |
| */ |
| if (init_tags) { |
| /* Initialize both memory and tags. */ |
| for (i = 0; i != 1 << order; ++i) |
| tag_clear_highpage(page + i); |
| |
| /* Note that memory is already initialized by the loop above. */ |
| init = false; |
| } |
| if (!should_skip_kasan_unpoison(gfp_flags)) { |
| /* Unpoison shadow memory or set memory tags. */ |
| kasan_unpoison_pages(page, order, init); |
| |
| /* Note that memory is already initialized by KASAN. */ |
| if (kasan_has_integrated_init()) |
| init = false; |
| } else { |
| /* Ensure page_address() dereferencing does not fault. */ |
| for (i = 0; i != 1 << order; ++i) |
| page_kasan_tag_reset(page + i); |
| } |
| /* If memory is still not initialized, do it now. */ |
| if (init) |
| kernel_init_pages(page, 1 << order); |
| /* Propagate __GFP_SKIP_KASAN_POISON to page flags. */ |
| if (kasan_hw_tags_enabled() && (gfp_flags & __GFP_SKIP_KASAN_POISON)) |
| SetPageSkipKASanPoison(page); |
| |
| set_page_owner(page, order, gfp_flags); |
| page_table_check_alloc(page, order); |
| } |
| |
| static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, |
| unsigned int alloc_flags) |
| { |
| post_alloc_hook(page, order, gfp_flags); |
| |
| if (order && (gfp_flags & __GFP_COMP)) |
| prep_compound_page(page, order); |
| |
| /* |
| * page is set pfmemalloc when ALLOC_NO_WATERMARKS was necessary to |
| * allocate the page. The expectation is that the caller is taking |
| * steps that will free more memory. The caller should avoid the page |
| * being used for !PFMEMALLOC purposes. |
| */ |
| if (alloc_flags & ALLOC_NO_WATERMARKS) |
| set_page_pfmemalloc(page); |
| else |
| clear_page_pfmemalloc(page); |
| } |
| |
| /* |
| * Go through the free lists for the given migratetype and remove |
| * the smallest available page from the freelists |
| */ |
| static __always_inline |
| struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, |
| int migratetype) |
| { |
| unsigned int current_order; |
| struct free_area *area; |
| struct page *page; |
| |
| /* Find a page of the appropriate size in the preferred list */ |
| for (current_order = order; current_order < MAX_ORDER; ++current_order) { |
| area = &(zone->free_area[current_order]); |
| page = get_page_from_free_area(area, migratetype); |
| if (!page) |
| continue; |
| del_page_from_free_list(page, zone, current_order); |
| expand(zone, page, order, current_order, migratetype); |
| set_pcppage_migratetype(page, migratetype); |
| trace_mm_page_alloc_zone_locked(page, order, migratetype, |
| pcp_allowed_order(order) && |
| migratetype < MIGRATE_PCPTYPES); |
| return page; |
| } |
| |
| return NULL; |
| } |
| |
| |
| /* |
| * This array describes the order lists are fallen back to when |
| * the free lists for the desirable migrate type are depleted |
| * |
| * The other migratetypes do not have fallbacks. |
| */ |
| static int fallbacks[MIGRATE_TYPES][3] = { |
| [MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, |
| [MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES }, |
| [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES }, |
| }; |
| |
| #ifdef CONFIG_CMA |
| static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone, |
| unsigned int order) |
| { |
| return __rmqueue_smallest(zone, order, MIGRATE_CMA); |
| } |
| #else |
| static inline struct page *__rmqueue_cma_fallback(struct zone *zone, |
| unsigned int order) { return NULL; } |
| #endif |
| |
/*
 * Move the free pages in a range to the freelist tail of the requested type.
 * The range runs from @start_pfn to @end_pfn inclusive and is not required
 * to be aligned on a pageblock boundary. If alignment is required, use
 * move_freepages_block().
 *
 * When @num_movable is not NULL, it is incremented for every page in the
 * range that is not free but is on the LRU or is __PageMovable().
 *
 * Returns the number of pages moved.
 */
static int move_freepages(struct zone *zone,
			  unsigned long start_pfn, unsigned long end_pfn,
			  int migratetype, int *num_movable)
{
	unsigned long pfn = start_pfn;
	int pages_moved = 0;

	while (pfn <= end_pfn) {
		struct page *page = pfn_to_page(pfn);
		unsigned int order;

		if (PageBuddy(page)) {
			/* Make sure we are not inadvertently changing nodes */
			VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
			VM_BUG_ON_PAGE(page_zone(page) != zone, page);

			/* Move the whole free block and step over it */
			order = buddy_order(page);
			move_to_free_list(page, zone, order, migratetype);
			pfn += 1 << order;
			pages_moved += 1 << order;
			continue;
		}

		/*
		 * We assume that pages that could be isolated for
		 * migration are movable. But we don't actually try
		 * isolating, as that would be expensive.
		 */
		if (num_movable &&
				(PageLRU(page) || __PageMovable(page)))
			(*num_movable)++;
		pfn++;
	}

	return pages_moved;
}
| |
/*
 * Move the free pages of the pageblock containing @page to the @migratetype
 * free list, using move_freepages().  *num_movable, when supplied, is reset
 * to zero first.
 *
 * Returns the number of pages moved, or 0 when the last pfn of the pageblock
 * is not spanned by @zone.
 */
int move_freepages_block(struct zone *zone, struct page *page,
				int migratetype, int *num_movable)
{
	unsigned long pfn, first_pfn, last_pfn;

	if (num_movable)
		*num_movable = 0;

	pfn = page_to_pfn(page);
	first_pfn = pageblock_start_pfn(pfn);
	last_pfn = pageblock_end_pfn(pfn) - 1;	/* inclusive end */

	/* Do not cross zone boundaries */
	if (!zone_spans_pfn(zone, last_pfn))
		return 0;
	if (!zone_spans_pfn(zone, first_pfn))
		first_pfn = pfn;

	return move_freepages(zone, first_pfn, last_pfn, migratetype,
								num_movable);
}
| |
| static void change_pageblock_range(struct page *pageblock_page, |
| int start_order, int migratetype) |
| { |
| int nr_pageblocks = 1 << (start_order - pageblock_order); |
| |
| while (nr_pageblocks--) { |
| set_pageblock_migratetype(pageblock_page, migratetype); |
| pageblock_page += pageblock_nr_pages; |
| } |
| } |
| |
| /* |
| * When we are falling back to another migratetype during allocation, try to |
| * steal extra free pages from the same pageblocks to satisfy further |
| * allocations, instead of polluting multiple pageblocks. |
| * |
| * If we are stealing a relatively large buddy page, it is likely there will |
| * be more free pages in the pageblock, so try to steal them all. For |
| * reclaimable and unmovable allocations, we steal regardless of page size, |
| * as fragmentation caused by those allocations polluting movable pageblocks |
| * is worse than movable allocations stealing from unmovable and reclaimable |
| * pageblocks. |
| */ |
| static bool can_steal_fallback(unsigned int order, int start_mt) |
| { |
| /* |
| * Leaving this order check is intended, although there is |
| * relaxed order check in next check. The reason is that |
| * we can actually steal whole pageblock if this condition met, |
| * but, below check doesn't guarantee it and that is just heuristic |
| * so could be changed anytime. |
| */ |
| if (order >= pageblock_order) |
| return true; |
| |
| if (order >= pageblock_order / 2 || |
| start_mt == MIGRATE_RECLAIMABLE || |
| start_mt == MIGRATE_UNMOVABLE || |
| page_group_by_mobility_disabled) |
| return true; |
| |
| return false; |
| } |
| |
| static inline bool boost_watermark(struct zone *zone) |
| { |
| unsigned long max_boost; |
| |
| if (!watermark_boost_factor) |
| return false; |
| /* |
| * Don't bother in zones that are unlikely to produce results. |
| * On small machines, including kdump capture kernels running |
| * in a small area, boosting the watermark can cause an out of |
| * memory situation immediately. |
| */ |
| if ((pageblock_nr_pages * 4) > zone_managed_pages(zone)) |
| return false; |
| |
| max_boost = mult_frac(zone->_watermark[WMARK_HIGH], |
| watermark_boost_factor, 10000); |
| |
| /* |
| * high watermark may be uninitialised if fragmentation occurs |
| * very early in boot so do not boost. We do not fall |
| * through and boost by pageblock_nr_pages as failing |
| * allocations that early means that reclaim is not going |
| * to help and it may even be impossible to reclaim the |
| * boosted watermark resulting in a hang. |
| */ |
| if (!max_boost) |
| return false; |
| |
| max_boost = max(pageblock_nr_pages, max_boost); |
| |
| zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, |
| max_boost); |
| |
| return true; |
| } |
| |
| /* |
| * This function implements actual steal behaviour. If order is large enough, |
| * we can steal whole pageblock. If not, we first move freepages in this |
| * pageblock to our migratetype and determine how many already-allocated pages |
| * are there in the pageblock with a compatible migratetype. If at least half |
| * of pages are free or compatible, we can change migratetype of the pageblock |
| * itself, so pages freed in the future will be put on the correct free list. |
| */ |
| static void steal_suitable_fallback(struct zone *zone, struct page *page, |
| unsigned int alloc_flags, int start_type, bool whole_block) |
| { |
| unsigned int current_order = buddy_order(page); |
| int free_pages, movable_pages, alike_pages; |
| int old_block_type; |
| |
| old_block_type = get_pageblock_migratetype(page); |
| |
| /* |
| * This can happen due to races and we want to prevent broken |
| * highatomic accounting. |
| */ |
| if (is_migrate_highatomic(old_block_type)) |
| goto single_page; |
| |
| /* Take ownership for orders >= pageblock_order */ |
| if (current_order >= pageblock_order) { |
| change_pageblock_range(page, current_order, start_type); |
| goto single_page; |
| } |
| |
| /* |
| * Boost watermarks to increase reclaim pressure to reduce the |
| * likelihood of future fallbacks. Wake kswapd now as the node |
| * may be balanced overall and kswapd will not wake naturally. |
| */ |
| if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD)) |
| set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags); |
| |
| /* We are not allowed to try stealing from the whole block */ |
| if (!whole_block) |
| goto single_page; |
| |
| free_pages = move_freepages_block(zone, page, start_type, |
| &movable_pages); |
| /* |
| * Determine how many pages are compatible with our allocation. |
| * For movable allocation, it's the number of movable pages which |
| * we just obtained. For other types it's a bit more tricky. |
| */ |
| if (start_type == MIGRATE_MOVABLE) { |
| alike_pages = movable_pages; |
| } else { |
| /* |
| * If we are falling back a RECLAIMABLE or UNMOVABLE allocation |
| * to MOVABLE pageblock, consider all non-movable pages as |
| * compatible. If it's UNMOVABLE falling back to RECLAIMABLE or |
| * vice versa, be conservative since we can't distinguish the |
| * exact migratetype of non-movable pages. |
| */ |
| if (old_block_type == MIGRATE_MOVABLE) |
| alike_pages = pageblock_nr_pages |
| - (free_pages + movable_pages); |
| else |
| alike_pages = 0; |
| } |
| |
| /* moving whole block can fail due to zone boundary conditions */ |
| if (!free_pages) |
| goto single_page; |
| |
| /* |
| * If a sufficient number of pages in the block are either free or of |
| * comparable migratability as our allocation, claim the whole block. |
| */ |
| if (free_pages + alike_pages >= (1 << (pageblock_order-1)) || |
| page_group_by_mobility_disabled) |
| set_pageblock_migratetype(page, start_type); |
| |
| return; |
| |
| single_page: |
| move_to_free_list(page, zone, current_order, start_type); |
| } |
| |
| /* |
| * Check whether there is a suitable fallback freepage with requested order. |
| * If only_stealable is true, this function returns fallback_mt only if |
| * we can steal other freepages all together. This would help to reduce |
| * fragmentation due to mixed migratetype pages in one pageblock. |
| */ |
| int find_suitable_fallback(struct free_area *area, unsigned int order, |
| int migratetype, bool only_stealable, bool *can_steal) |
| { |
| int i; |
| int fallback_mt; |
| |
| if (area->nr_free == 0) |
| return -1; |
| |
| *can_steal = false; |
| for (i = 0;; i++) { |
| fallback_mt = fallbacks[migratetype][i]; |
| if (fallback_mt == MIGRATE_TYPES) |
| break; |
| |
| if (free_area_empty(area, fallback_mt)) |
| continue; |
| |
| if (can_steal_fallback(order, migratetype)) |
| *can_steal = true; |
| |
| if (!only_stealable) |
| return fallback_mt; |
| |
| if (*can_steal) |
| return fallback_mt; |
| } |
| |
| return -1; |
| } |
| |
| /* |
| * Reserve a pageblock for exclusive use of high-order atomic allocations if |
| * there are no empty page blocks that contain a page with a suitable order |
| */ |
| static void reserve_highatomic_pageblock(struct page *page, struct zone *zone, |
| unsigned int alloc_order) |
| { |
| int mt; |
| unsigned long max_managed, flags; |
| |
| /* |
| * Limit the number reserved to 1 pageblock or roughly 1% of a zone. |
| * Check is race-prone but harmless. |
| */ |
| max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages; |
| if (zone->nr_reserved_highatomic >= max_managed) |
| return; |
| |
| spin_lock_irqsave(&zone->lock, flags); |
| |
| /* Recheck the nr_reserved_highatomic limit under the lock */ |
| if (zone->nr_reserved_highatomic >= max_managed) |
| goto out_unlock; |
| |
| /* Yoink! */ |
| mt = get_pageblock_migratetype(page); |
| /* Only reserve normal pageblocks (i.e., they can merge with others) */ |
| if (migratetype_is_mergeable(mt)) { |
| zone->nr_reserved_highatomic += pageblock_nr_pages; |
| set_pageblock_migratetype(page, MIGRATE_HIGHATOMIC); |
| move_freepages_block(zone, page, MIGRATE_HIGHATOMIC, NULL); |
| } |
| |
| out_unlock: |
| spin_unlock_irqrestore(&zone->lock, flags); |
| } |
| |
| /* |
| * Used when an allocation is about to fail under memory pressure. This |
| * potentially hurts the reliability of high-order allocations when under |
| * intense memory pressure but failed atomic allocations should be easier |
| * to recover from than an OOM. |
| * |
| * If @force is true, try to unreserve a pageblock even though highatomic |
| * pageblock is exhausted. |
| */ |
| static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, |
| bool force) |
| { |
| struct zonelist *zonelist = ac->zonelist; |
| unsigned long flags; |
| struct zoneref *z; |
| struct zone *zone; |
| struct page *page; |
| int order; |
| bool ret; |
| |
| for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx, |
| ac->nodemask) { |
| /* |
| * Preserve at least one pageblock unless memory pressure |
| * is really high. |
| */ |
| if (!force && zone->nr_reserved_highatomic <= |
| pageblock_nr_pages) |
| continue; |
| |
| spin_lock_irqsave(&zone->lock, flags); |
| for (order = 0; order < MAX_ORDER; order++) { |
| struct free_area *area = &(zone->free_area[order]); |
| |
| page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC); |
| if (!page) |
| continue; |
| |
| /* |
| * In page freeing path, migratetype change is racy so |
| * we can counter several free pages in a pageblock |
| * in this loop although we changed the pageblock type |
| * from highatomic to ac->migratetype. So we should |
| * adjust the count once. |
| */ |
| if (is_migrate_highatomic_page(page)) { |
| /* |
| * It should never happen but changes to |
| * locking could inadvertently allow a per-cpu |
| * drain to add pages to MIGRATE_HIGHATOMIC |
| * while unreserving so be safe and watch for |
| * underflows. |
| */ |
| zone->nr_reserved_highatomic -= min( |
| pageblock_nr_pages, |
| zone->nr_reserved_highatomic); |
| } |
| |
| /* |
| * Convert to ac->migratetype and avoid the normal |
| * pageblock stealing heuristics. Minimally, the caller |
| * is doing the work and needs the pages. More |
| * importantly, if the block was always converted to |
| * MIGRATE_UNMOVABLE or another type then the number |
| * of pageblocks that cannot be completely freed |
| * may increase. |
| */ |
| set_pageblock_migratetype(page, ac->migratetype); |
| ret = move_freepages_block(zone, page, ac->migratetype, |
| NULL); |
| if (ret) { |
| spin_unlock_irqrestore(&zone->lock, flags); |
| return ret; |
| } |
| } |
| spin_unlock_irqrestore(&zone->lock, flags); |
| } |
| |
| return false; |
| } |
| |
| /* |
| * Try finding a free buddy page on the fallback list and put it on the free |
| * list of requested migratetype, possibly along with other pages from the same |
| * block, depending on fragmentation avoidance heuristics. Returns true if |
| * fallback was found so that __rmqueue_smallest() can grab it. |
| * |
| * The use of signed ints for order and current_order is a deliberate |
| * deviation from the rest of this file, to make the for loop |
| * condition simpler. |
| */ |
| static __always_inline bool |
| __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, |
| unsigned int alloc_flags) |
| { |
| struct free_area *area; |
| int current_order; |
| int min_order = order; |
| struct page *page; |
| int fallback_mt; |
| bool can_steal; |
| |
| /* |
| * Do not steal pages from freelists belonging to other pageblocks |
| * i.e. orders < pageblock_order. If there are no local zones free, |
| * the zonelists will be reiterated without ALLOC_NOFRAGMENT. |
| */ |
| if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT) |
| min_order = pageblock_order; |
| |
| /* |
| * Find the largest available free page in the other list. This roughly |
| * approximates finding the pageblock with the most free pages, which |
| * would be too costly to do exactly. |
| */ |
| for (current_order = MAX_ORDER - 1; current_order >= min_order; |
| --current_order) { |
| area = &(zone->free_area[current_order]); |
| fallback_mt = find_suitable_fallback(area, current_order, |
| start_migratetype, false, &can_steal); |
| if (fallback_mt == -1) |
| continue; |
| |
| /* |
| * We cannot steal all free pages from the pageblock and the |
| * requested migratetype is movable. In that case it's better to |
| * steal and split the smallest available page instead of the |
| * largest available page, because even if the next movable |
| * allocation falls back into a different pageblock than this |
| * one, it won't cause permanent fragmentation. |
| */ |
| if (!can_steal && start_migratetype == MIGRATE_MOVABLE |
| && current_order > order) |
| goto find_smallest; |
| |
| goto do_steal; |
| } |
| |
| return false; |
| |
| find_smallest: |
| for (current_order = order; current_order < MAX_ORDER; |
| current_order++) { |
| area = &(zone->free_area[current_order]); |
| fallback_mt = find_suitable_fallback(area, current_order, |
| start_migratetype, false, &can_steal); |
| if (fallback_mt != -1) |
| break; |
| } |
| |
| /* |
| * This should not happen - we already found a suitable fallback |
| * when looking for the largest page. |
| */ |
| VM_BUG_ON(current_order == MAX_ORDER); |
| |
| do_steal: |
| page = get_page_from_free_area(area, fallback_mt); |
| |
| steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, |
| can_steal); |
| |
| trace_mm_page_alloc_extfrag(page, order, current_order, |
| start_migratetype, fallback_mt); |
| |
| return true; |
| |
| } |
| |
| /* |
| * Do the hard work of removing an element from the buddy allocator. |
| * Call me with the zone->lock already held. |
| * |
| * Sources are tried in this order: |
| * 1. the CMA fallback, but only when ALLOC_CMA is set and more than half |
| * of the zone's free pages are free CMA pages; |
| * 2. the free lists of the requested migratetype; |
| * 3. the CMA fallback again, when ALLOC_CMA is set; |
| * 4. __rmqueue_fallback(); when it reports success the search restarts |
| * at step 2. |
| * |
| * Returns the page that was removed, or NULL when none of the above |
| * produced one and __rmqueue_fallback() returned false. |
| */ |
| static __always_inline struct page * |
| __rmqueue(struct zone *zone, unsigned int order, int migratetype, |
| unsigned int alloc_flags) |
| { |
| const bool cma_allowed = alloc_flags & ALLOC_CMA; |
| struct page *page; |
| |
| /* |
| * Balance movable allocations between regular and CMA areas by |
| * allocating from CMA when over half of the zone's free memory |
| * is in the CMA area. |
| */ |
| if (IS_ENABLED(CONFIG_CMA) && cma_allowed && |
| zone_page_state(zone, NR_FREE_CMA_PAGES) > |
| zone_page_state(zone, NR_FREE_PAGES) / 2) { |
| page = __rmqueue_cma_fallback(zone, order); |
| if (page) |
| return page; |
| } |
| |
| for (;;) { |
| page = __rmqueue_smallest(zone, order, migratetype); |
| if (likely(page)) |
| return page; |
| |
| if (cma_allowed) { |
| page = __rmqueue_cma_fallback(zone, order); |
| if (page) |
| return page; |
| } |
| |
| /* Nothing could be moved over from another migratetype: give up. */ |
| if (!__rmqueue_fallback(zone, order, migratetype, alloc_flags)) |
| return NULL; |
| } |
| } |
| |
| /* |
| * Pull up to @count elements of the given @order out of the buddy |
| * allocator under a single hold of zone->lock, for efficiency, and append |
| * them to @list. |
| * |
| * Returns the number of entries that were added to @list. This can be |
| * smaller than the number of elements removed from the buddy lists, because |
| * an element rejected by check_pcp_refill() is not queued. |
| */ |
| static int rmqueue_bulk(struct zone *zone, unsigned int order, |
| unsigned long count, struct list_head *list, |
| int migratetype, unsigned int alloc_flags) |
| { |
| int nr_removed = 0, nr_queued = 0; |
| unsigned long flags; |
| |
| spin_lock_irqsave(&zone->lock, flags); |
| while (nr_removed < count) { |
| struct page *page; |
| |
| page = __rmqueue(zone, order, migratetype, alloc_flags); |
| if (unlikely(!page)) |
| break; |
| |
| /* It left the buddy lists whether or not it passes the check below. */ |
| nr_removed++; |
| if (unlikely(check_pcp_refill(page, order))) |
| continue; |
| |
| /* |
| * Split buddy pages returned by expand() are received here in |
| * physical page order, and each one is added to the tail of the |
| * caller's list. From the caller's perspective the list is |
| * therefore, under some conditions, ordered by page number when |
| * walked from the head. This is useful for IO devices that can |
| * merge IO requests if the physical pages are ordered properly. |
| */ |
| list_add_tail(&page->pcp_list, list); |
| nr_queued++; |
| if (is_migrate_cma(get_pcppage_migratetype(page))) |
| __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, |
| -(1 << order)); |
| } |
| |
| /* |
| * NR_FREE_PAGES is adjusted by everything that was removed from the |
| * buddy lists, including elements that leaked because |
| * check_pcp_refill() failed. Do not confuse this with nr_queued, |
| * which only counts what was added to the caller's list. |
| */ |
| __mod_zone_page_state(zone, NR_FREE_PAGES, -(nr_removed << order)); |
| spin_unlock_irqrestore(&zone->lock, flags); |
| return nr_queued; |
| } |
| |
| #ifdef CONFIG_NUMA |
| /* |
| * Called from the vmstat counter updater to drain pagesets of this |
| * currently executing processor on remote nodes after they have |
| * expired. |
| * |
| * At most pcp->batch pages are released per call. Both pcp->batch and |
| * pcp->count are sampled before pcp->lock is taken; the lock is only |
| * acquired when that sample says there is something to free. |
| */ |
| void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) |
| { |
| int batch = READ_ONCE(pcp->batch); |
| int nr_to_free = min(pcp->count, batch); |
| |
| if (nr_to_free <= 0) |
| return; |
| |
| spin_lock(&pcp->lock); |
| free_pcppages_bulk(zone, nr_to_free, pcp, 0); |
| spin_unlock(&pcp->lock); |
| } |
| #endif |
| |
| /* |
| * Drain pcplists of the indicated processor and zone. |
| * |
| * @cpu: processor whose per-cpu pageset of @zone is emptied. |
| * @zone: zone that owns the pageset. |
| * |
| * pcp->count is only examined with pcp->lock held. Testing it before |
| * taking the lock could observe 0 while another context that holds the |
| * lock is still adding pages to the list and has not updated the count |
| * yet; the drain would then be skipped and those pages left behind, which |
| * breaks callers that rely on the list being empty on return. |
| */ |
| static void drain_pages_zone(unsigned int cpu, struct zone *zone) |
| { |
| struct per_cpu_pages *pcp; |
| |
| pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); |
| |
| spin_lock(&pcp->lock); |
| if (pcp->count) |
| free_pcppages_bulk(zone, pcp->count, pcp, 0); |
| spin_unlock(&pcp->lock); |
| } |
| |
| /* |
| * Drain pcplists of all zones on the indicated processor. |
| * |
| * Walks every populated zone and hands each one to drain_pages_zone() |
| * for @cpu. |
| */ |
| static void drain_pages(unsigned int cpu) |
| { |
| struct zone *populated; |
| |
| for_each_populated_zone(populated) |
| drain_pages_zone(cpu, populated); |
| } |
| |
| /* |
| * Spill all of this CPU's per-cpu pages back into the buddy allocator. |
| * |
| * When @zone is non-NULL only that zone's pages are spilled; a NULL @zone |
| * drains every populated zone. |
| * |
| * NOTE(review): the CPU is taken from smp_processor_id(), so the caller is |
| * presumably expected to be pinned to it - confirm against the callers. |
| */ |
| void drain_local_pages(struct zone *zone) |
| { |
| const int this_cpu = smp_processor_id(); |
| |
| if (!zone) { |
| drain_pages(this_cpu); |
| return; |
| } |
| |
| drain_pages_zone(this_cpu, zone); |
| } |
| |
| /* |
| * The implementation of drain_all_pages(), exposing an extra parameter to |
| * drain on all cpus. |
| * |
| * drain_all_pages() is optimized to only execute on cpus where pcplists are |
| * not empty. The check for non-emptiness can however race with a free to |
| * pcplist that has not yet increased the pcp->count from 0 to 1. Callers |
| * that need the guarantee that every CPU has drained can disable the |
| * optimizing racy check. |
| */ |
| static void __drain_all_pages(struct zone *zone, bool force_all_cpus) |
| { |
| int cpu; |
| |
| /* |
| * Allocate in the BSS so we won't require allocation in |
| * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y |
| */ |
| static cpumask_t cpus_with_pcps; |
| |
| /* |
| * Do not drain if one is already in progress unless it's specific to |
| * a zone. Such callers are primarily CMA and memory hotplug and need |
| * the drain to be complete when the call returns. |
| */ |
| if (unlikely(!mutex_trylock(&pcpu_drain_mutex))) { |
| if (!zone) |
| return; |
| mutex_lock(&pcpu_drain_mutex); |
| } |
| |
| /* |
| * We don't care about racing with CPU hotplug event |
| * as offline notification will cause the notified |
| * cpu to drain that CPU pcps and on_each_cpu_mask |
| * disables preemption as part of its processing |
| */ |
| for_each_online_cpu(cpu) { |
| struct per_cpu_pages *pcp; |
| struct zone *z; |
| bool has_pcps = false; |
| |
| if (force_all_cpus) { |
| /* |
| * The pcp.count check is racy, some callers need a |
| * guarantee that no cpu is missed. |
| */ |
| has_pcps = true; |
| } else if (zone) { |
| pcp = per_cpu_ptr(zone->per_cpu_pageset, cpu); |
| if (pcp->count) |
| <
|