bbs.cooldavid.org Git - net-next-2.6.git/commitdiff
mm: compaction: direct compact when a high-order allocation fails
author Mel Gorman <mel@csn.ul.ie>
Mon, 24 May 2010 21:32:30 +0000 (14:32 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 25 May 2010 15:06:59 +0000 (08:06 -0700)
Ordinarily when a high-order allocation fails, direct reclaim is entered
to free pages to satisfy the allocation.  With this patch, it is
determined if an allocation failed due to external fragmentation instead
of low memory and if so, the calling process will compact until a suitable
page is freed.  Compaction by moving pages in memory is considerably
cheaper than paging out to disk and works where there are locked pages or
no swap.  If compaction fails to free a page of a suitable size, then
reclaim will still occur.

Direct compaction returns as soon as possible.  As each block is
compacted, it is checked if a suitable page has been freed and if so, it
returns.

[akpm@linux-foundation.org: Fix build errors]
[aarcange@redhat.com: fix count_vm_event preempt in memory compaction direct reclaim]
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Acked-by: Rik van Riel <riel@redhat.com>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
include/linux/compaction.h
include/linux/vmstat.h
mm/compaction.c
mm/page_alloc.c
mm/vmstat.c

index ba98cfe0ae1502a81a7ed13c1bb571856bee9c6e..eed40ec4280b2275e210ca848674df99174528b5 100644 (file)
@@ -1,15 +1,31 @@
 #ifndef _LINUX_COMPACTION_H
 #define _LINUX_COMPACTION_H
 
-/* Return values for compact_zone() */
-#define COMPACT_CONTINUE       0
-#define COMPACT_PARTIAL                1
-#define COMPACT_COMPLETE       2
+/* Return values for compact_zone() and try_to_compact_pages() */
+/* compaction didn't start as it was not possible or direct reclaim was more suitable */
+#define COMPACT_SKIPPED                0
+/* compaction should continue to another pageblock */
+#define COMPACT_CONTINUE       1
+/* direct compaction partially compacted a zone and there are suitable pages */
+#define COMPACT_PARTIAL                2
+/* The full zone was compacted */
+#define COMPACT_COMPLETE       3
 
 #ifdef CONFIG_COMPACTION
 extern int sysctl_compact_memory;
 extern int sysctl_compaction_handler(struct ctl_table *table, int write,
                        void __user *buffer, size_t *length, loff_t *ppos);
+
+extern int fragmentation_index(struct zone *zone, unsigned int order);
+extern unsigned long try_to_compact_pages(struct zonelist *zonelist,
+                       int order, gfp_t gfp_mask, nodemask_t *mask);
+#else
+static inline unsigned long try_to_compact_pages(struct zonelist *zonelist,
+                       int order, gfp_t gfp_mask, nodemask_t *nodemask)
+{
+       return COMPACT_CONTINUE; /* NOTE(review): stub for !CONFIG_COMPACTION does no work yet reports CONTINUE, not COMPACT_SKIPPED - confirm callers expect this */
+}
+
 #endif /* CONFIG_COMPACTION */
 
 #if defined(CONFIG_COMPACTION) && defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
index b421d1b22b628719491c6709e8545f6a73a94ed9..7f43ccdc1d38c0eb919efe4891e1b91ec19c3705 100644 (file)
@@ -45,6 +45,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
                PAGEOUTRUN, ALLOCSTALL, PGROTATED,
 #ifdef CONFIG_COMPACTION
                COMPACTBLOCKS, COMPACTPAGES, COMPACTPAGEFAILED,
+               COMPACTSTALL, COMPACTFAIL, COMPACTSUCCESS,
 #endif
 #ifdef CONFIG_HUGETLB_PAGE
                HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL,
index f61f77983ff4e1ec15759370c3caa4ba199254ca..9583e193dc47b3e0f5728629bec48a94685de037 100644 (file)
@@ -35,6 +35,8 @@ struct compact_control {
        unsigned long nr_anon;
        unsigned long nr_file;
 
+       unsigned int order;             /* order a direct compactor needs */
+       int migratetype;                /* MOVABLE, RECLAIMABLE etc */
        struct zone *zone;
 };
 
@@ -341,6 +343,9 @@ static void update_nr_listpages(struct compact_control *cc)
 static int compact_finished(struct zone *zone,
                                                struct compact_control *cc)
 {
+       unsigned int order;
+       unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order);
+
        if (fatal_signal_pending(current))
                return COMPACT_PARTIAL;
 
@@ -348,6 +353,24 @@ static int compact_finished(struct zone *zone,
        if (cc->free_pfn <= cc->migrate_pfn)
                return COMPACT_COMPLETE;
 
+       /* Compaction run is not finished if the watermark is not met */
+       if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
+               return COMPACT_CONTINUE;
+
+       if (cc->order == -1)
+               return COMPACT_CONTINUE;
+
+       /* Direct compactor: Is a suitable page free? */
+       for (order = cc->order; order < MAX_ORDER; order++) {
+               /* Job done if page is free of the right migratetype */
+               if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
+                       return COMPACT_PARTIAL;
+
+               /* Job done if allocation would set block type */
+               if (order >= pageblock_order && zone->free_area[order].nr_free)
+                       return COMPACT_PARTIAL;
+       }
+
        return COMPACT_CONTINUE;
 }
 
@@ -394,6 +417,99 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
        return ret;
 }
 
+static unsigned long compact_zone_order(struct zone *zone,
+                                               int order, gfp_t gfp_mask)
+{
+       struct compact_control cc = {   /* per-call state handed to compact_zone() below */
+               .nr_freepages = 0,
+               .nr_migratepages = 0,
+               .order = order,         /* order the direct compactor needs */
+               .migratetype = allocflags_to_migratetype(gfp_mask),     /* derived from the caller's GFP mask */
+               .zone = zone,
+       };
+       INIT_LIST_HEAD(&cc.freepages);
+       INIT_LIST_HEAD(&cc.migratepages);
+
+       return compact_zone(zone, &cc); /* compact_zone()'s int status is returned as unsigned long */
+}
+
+/**
+ * try_to_compact_pages - Direct compact to satisfy a high-order allocation
+ * @zonelist: The zonelist used for the current allocation
+ * @order: The order of the current allocation
+ * @gfp_mask: The GFP mask of the current allocation
+ * @nodemask: The allowed nodes to allocate from
+ *
+ * This is the main entry point for direct page compaction.
+ *
+ * Returns COMPACT_SKIPPED without touching any zone when @order is not
+ * above PAGE_ALLOC_COSTLY_ORDER or @gfp_mask lacks __GFP_FS or __GFP_IO.
+ * The result also stays COMPACT_SKIPPED if every zone fails the order-0
+ * watermark check or the fragmentation index check. Otherwise the result
+ * is COMPACT_PARTIAL when the loop stops at a zone whose fragmentation
+ * index is -1 and which passes zone_watermark_ok() at @order, or else the
+ * largest status returned by compact_zone_order() for the zones that
+ * were compacted.
+ */
+unsigned long try_to_compact_pages(struct zonelist *zonelist,
+                       int order, gfp_t gfp_mask, nodemask_t *nodemask)
+{
+       enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+       int may_enter_fs = gfp_mask & __GFP_FS;
+       int may_perform_io = gfp_mask & __GFP_IO;
+       unsigned long watermark;
+       struct zoneref *z;
+       struct zone *zone;
+       int rc = COMPACT_SKIPPED;
+
+       /*
+        * Check whether it is worth even starting compaction. The order check is
+        * made because an assumption is made that the page allocator can satisfy
+        * the "cheaper" orders without taking special steps
+        */
+       if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io)
+               return rc;
+
+       count_vm_event(COMPACTSTALL);
+
+       /* Compact each zone in the list */
+       for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
+                                                               nodemask) {
+               int fragindex;
+               int status;
+
+               /*
+                * Watermarks for order-0 must be met for compaction. Note
+                * the 2UL. This is because during migration, copies of
+                * pages need to be allocated and for a short time, the
+                * footprint is higher
+                */
+               watermark = low_wmark_pages(zone) + (2UL << order);
+               if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
+                       continue;
+
+               /*
+                * fragmentation index determines if allocation failures are
+                * due to low memory or external fragmentation
+                *
+                * index of -1 implies allocations might succeed depending
+                *      on watermarks
+                * index towards 0 implies failure is due to lack of memory
+                * index towards 1000 implies failure is due to fragmentation
+                *
+                * Only compact if a failure would be due to fragmentation.
+                * Zones with an index from 0 to 500 inclusive are skipped.
+                */
+               fragindex = fragmentation_index(zone, order);
+               if (fragindex >= 0 && fragindex <= 500)
+                       continue;
+
+               if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
+                       rc = COMPACT_PARTIAL;
+                       break;
+               }
+
+               status = compact_zone_order(zone, order, gfp_mask);
+               rc = max(status, rc);
+
+               if (zone_watermark_ok(zone, order, watermark, 0, 0))
+                       break;
+       }
+
+       return rc;
+}
+
+
 /* Compact all zones within a node */
 static int compact_node(int nid)
 {
@@ -412,6 +528,7 @@ static int compact_node(int nid)
                struct compact_control cc = {
                        .nr_freepages = 0,
                        .nr_migratepages = 0,
+                       .order = -1,
                };
 
                zone = &pgdat->node_zones[zoneid];
index c54376a09f309523e0e133e3931604ac4f9752aa..cd88a860f088aac5cc19781c5e1aa02730265897 100644 (file)
@@ -49,6 +49,7 @@
 #include <linux/debugobjects.h>
 #include <linux/kmemleak.h>
 #include <linux/memory.h>
+#include <linux/compaction.h>
 #include <trace/events/kmem.h>
 #include <linux/ftrace_event.h>
 
@@ -1758,6 +1759,59 @@ out:
        return page;
 }
 
+#ifdef CONFIG_COMPACTION
+/*
+ * Try memory compaction for high-order allocations before reclaim.
+ *
+ * Returns the page obtained from get_page_from_freelist() after compaction,
+ * or NULL when @order is 0, when try_to_compact_pages() reports
+ * COMPACT_SKIPPED, or when the allocation still fails afterwards.
+ * The try_to_compact_pages() result is stored through @did_some_progress;
+ * nothing is stored there when @order is 0.
+ */
+static struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+       struct zonelist *zonelist, enum zone_type high_zoneidx,
+       nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+       int migratetype, unsigned long *did_some_progress)
+{
+       struct page *page;
+
+       if (!order)
+               return NULL;
+
+       *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask,
+                                                               nodemask);
+       if (*did_some_progress != COMPACT_SKIPPED) {
+
+               /* Page migration frees to the PCP lists but we want merging */
+               drain_pages(get_cpu());
+               put_cpu();
+
+               page = get_page_from_freelist(gfp_mask, nodemask,
+                               order, zonelist, high_zoneidx,
+                               alloc_flags, preferred_zone,
+                               migratetype);
+               if (page) {
+                       count_vm_event(COMPACTSUCCESS);
+                       return page;
+               }
+
+               /*
+                * It's bad if a compaction run occurs and fails.
+                * The most likely reason is that pages exist,
+                * but not enough to satisfy watermarks.
+                */
+               count_vm_event(COMPACTFAIL);
+
+               cond_resched();
+       }
+
+       return NULL;
+}
+#else
+static inline struct page *
+__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
+       struct zonelist *zonelist, enum zone_type high_zoneidx,
+       nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
+       int migratetype, unsigned long *did_some_progress)
+{
+       return NULL;
+}
+#endif /* CONFIG_COMPACTION */
+
 /* The really slow allocator path where we enter direct reclaim */
 static inline struct page *
 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -1944,6 +1998,15 @@ rebalance:
        if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
                goto nopage;
 
+       /* Try direct compaction */
+       page = __alloc_pages_direct_compact(gfp_mask, order,
+                                       zonelist, high_zoneidx,
+                                       nodemask,
+                                       alloc_flags, preferred_zone,
+                                       migratetype, &did_some_progress);
+       if (page)
+               goto got_pg;
+
        /* Try direct reclaim and then allocating */
        page = __alloc_pages_direct_reclaim(gfp_mask, order,
                                        zonelist, high_zoneidx,
index c6aacf51b554545a89f2f0b23b6c3df2b8eb0a75..7759941d4e7760616e1bf89c0ed887f485b473a9 100644 (file)
@@ -429,7 +429,7 @@ static void fill_contig_page_info(struct zone *zone,
  * The value can be used to determine if page reclaim or compaction
  * should be used
  */
-int fragmentation_index(unsigned int order, struct contig_page_info *info)
+static int __fragmentation_index(unsigned int order, struct contig_page_info *info)
 {
        unsigned long requested = 1UL << order;
 
@@ -448,6 +448,15 @@ int fragmentation_index(unsigned int order, struct contig_page_info *info)
         */
        return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
 }
+
+/* Same as __fragmentation_index() but fills a contig_page_info on the stack for the zone first */
+int fragmentation_index(struct zone *zone, unsigned int order)
+{
+       struct contig_page_info info;
+
+       fill_contig_page_info(zone, order, &info);
+       return __fragmentation_index(order, &info);
+}
 #endif
 
 #if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION)
@@ -771,6 +780,9 @@ static const char * const vmstat_text[] = {
        "compact_blocks_moved",
        "compact_pages_moved",
        "compact_pagemigrate_failed",
+       "compact_stall",
+       "compact_fail",
+       "compact_success",
 #endif
 
 #ifdef CONFIG_HUGETLB_PAGE
@@ -1136,7 +1148,7 @@ static void extfrag_show_print(struct seq_file *m,
                                zone->name);
        for (order = 0; order < MAX_ORDER; ++order) {
                fill_contig_page_info(zone, order, &info);
-               index = fragmentation_index(order, &info);
+               index = __fragmentation_index(order, &info);
                seq_printf(m, "%d.%03d ", index / 1000, index % 1000);
        }