[net-next-2.6.git] / mm / compaction.c

/*
 * linux/mm/compaction.c
 *
 * Memory compaction for the reduction of external fragmentation. Note that
 * this heavily depends upon page migration to do all the real heavy
 * lifting
 *
 * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
 */
#include <linux/swap.h>
#include <linux/migrate.h>
#include <linux/compaction.h>
#include <linux/mm_inline.h>
#include <linux/backing-dev.h>
#include <linux/sysctl.h>
#include <linux/sysfs.h>
#include "internal.h"

/*
 * compact_control is used to track pages being migrated and the free pages
 * they are being migrated to during memory compaction. The free_pfn starts
 * at the end of a zone and migrate_pfn begins at the start. Movable pages
 * are moved to the end of a zone during a compaction run and the run
 * completes when free_pfn <= migrate_pfn
 */
struct compact_control {
	struct list_head freepages;	/* List of free pages to migrate to */
	struct list_head migratepages;	/* List of pages being migrated */
	unsigned long nr_freepages;	/* Number of isolated free pages */
	unsigned long nr_migratepages;	/* Number of pages to migrate */
	unsigned long free_pfn;		/* isolate_freepages search base */
	unsigned long migrate_pfn;	/* isolate_migratepages search base */

	/*
	 * Account for isolated anon and file pages. Both are recomputed
	 * from the migratepages list by acct_isolated().
	 */
	unsigned long nr_anon;
	unsigned long nr_file;

	/*
	 * compact_node() stores -1 in 'order' (which wraps, the field being
	 * unsigned) and compact_finished() compares against -1 to recognise
	 * a run that has no target allocation order.
	 */
	unsigned int order;		/* order a direct compactor needs */
	int migratetype;		/* MOVABLE, RECLAIMABLE etc */
	struct zone *zone;		/* zone handed to isolate_freepages() by compaction_alloc() */
};

/*
 * Give every page on @freelist back to the page allocator.
 *
 * Each page is unlinked before it is freed, so the list is empty on
 * return. Returns the number of pages that were released.
 */
static unsigned long release_freepages(struct list_head *freelist)
{
	unsigned long nr_released = 0;
	struct page *victim, *tmp;

	list_for_each_entry_safe(victim, tmp, freelist, lru) {
		/* Unlink first so a freed page never sits on our list */
		list_del(&victim->lru);
		__free_page(victim);
		++nr_released;
	}

	return nr_released;
}

/*
 * Isolate free pages onto a private freelist. Must hold zone->lock
 *
 * Scans PFNs from @blockpfn up to (but not including) the smaller of
 * @blockpfn + pageblock_nr_pages and the end of @zone. Every buddy page
 * found is split with split_free_page() and the resulting order-0 pages
 * are added to @freelist.
 *
 * Returns the total number of order-0 pages placed on @freelist.
 */
static unsigned long isolate_freepages_block(struct zone *zone,
				unsigned long blockpfn,
				struct list_head *freelist)
{
	unsigned long zone_end_pfn, end_pfn;
	int total_isolated = 0;
	struct page *cursor;

	/* Get the last PFN we should scan for free pages at */
	zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
	end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn);

	/*
	 * Find the first usable PFN in the block to initialise page cursor.
	 * If none is found, blockpfn ends up equal to end_pfn and the scan
	 * loop below does not execute, so the cursor is never dereferenced.
	 */
	for (; blockpfn < end_pfn; blockpfn++) {
		if (pfn_valid_within(blockpfn))
			break;
	}
	cursor = pfn_to_page(blockpfn);

	/*
	 * Isolate free pages. This assumes the block is valid.
	 * blockpfn and cursor are advanced in lock-step so that cursor is
	 * always the struct page for blockpfn.
	 */
	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
		int isolated, i;
		struct page *page = cursor;

		if (!pfn_valid_within(blockpfn))
			continue;

		/* Only pages sitting in the buddy allocator are candidates */
		if (!PageBuddy(page))
			continue;

		/* Found a free page, break it into order-0 pages */
		isolated = split_free_page(page);
		total_isolated += isolated;
		/* Queue each of the 'isolated' consecutive struct pages */
		for (i = 0; i < isolated; i++) {
			list_add(&page->lru, freelist);
			page++;
		}

		/*
		 * If a page was split, advance to the end of it. The loop
		 * increment supplies the final +1, hence "isolated - 1".
		 */
		if (isolated) {
			blockpfn += isolated - 1;
			cursor += isolated - 1;
		}
	}

	return total_isolated;
}

/*
 * Decide whether the pageblock containing @page may receive migrated pages.
 *
 * MIGRATE_ISOLATE and MIGRATE_RESERVE blocks are always refused so that
 * memory hot-remove and the min_free_kbytes reserve are left alone.
 * MIGRATE_MOVABLE blocks are always accepted. Any other block is accepted
 * only when @page is a free buddy page of at least pageblock_order.
 */
static bool suitable_migration_target(struct page *page)
{
	int block_type = get_pageblock_migratetype(page);

	if (block_type == MIGRATE_ISOLATE || block_type == MIGRATE_RESERVE)
		return false;

	if (block_type == MIGRATE_MOVABLE)
		return true;

	/* Remaining block types qualify only through a large free page */
	return PageBuddy(page) && page_order(page) >= pageblock_order;
}

/*
 * Based on information in the current compact_control, find blocks
 * suitable for isolating free pages from and then isolate them.
 *
 * The scan starts at cc->free_pfn and walks downwards one pageblock at a
 * time, under zone->lock with interrupts disabled. It stops once the PFN
 * is no longer above cc->migrate_pfn + pageblock_nr_pages or once at least
 * cc->nr_migratepages free pages are held.
 *
 * On return cc->nr_freepages holds the updated count and cc->free_pfn is
 * set to the highest block PFN that yielded pages during this call (or to
 * the lower scan bound when nothing was isolated).
 */
static void isolate_freepages(struct zone *zone,
				struct compact_control *cc)
{
	struct page *page;
	unsigned long high_pfn, low_pfn, pfn;
	unsigned long flags;
	int nr_freepages = cc->nr_freepages;
	struct list_head *freelist = &cc->freepages;

	pfn = cc->free_pfn;
	/* Stay one pageblock clear of the migrate scanner */
	low_pfn = cc->migrate_pfn + pageblock_nr_pages;
	high_pfn = low_pfn;

	/*
	 * Isolate free pages until enough are available to migrate the
	 * pages on cc->migratepages. We stop searching if the migrate
	 * and free page scanners meet or enough free pages are isolated.
	 */
	spin_lock_irqsave(&zone->lock, flags);
	for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
					pfn -= pageblock_nr_pages) {
		unsigned long isolated;

		/* Skip blocks whose first PFN has no struct page */
		if (!pfn_valid(pfn))
			continue;

		/*
		 * Check for overlapping nodes/zones. It's possible on some
		 * configurations to have a setup like
		 * node0 node1 node0
		 * i.e. it's possible that all pages within a zones range of
		 * pages do not belong to a single zone.
		 */
		page = pfn_to_page(pfn);
		if (page_zone(page) != zone)
			continue;

		/* Check the block is suitable for migration */
		if (!suitable_migration_target(page))
			continue;

		/* Found a block suitable for isolating free pages from */
		isolated = isolate_freepages_block(zone, pfn, freelist);
		nr_freepages += isolated;

		/*
		 * Record the highest PFN we isolated pages from. When next
		 * looking for free pages, the search will restart here as
		 * page migration may have returned some pages to the allocator
		 */
		if (isolated)
			high_pfn = max(high_pfn, pfn);
	}
	spin_unlock_irqrestore(&zone->lock, flags);

	/*
	 * split_free_page does not map the pages, so do it here for every
	 * page on the freelist, after zone->lock has been dropped.
	 */
	list_for_each_entry(page, freelist, lru) {
		arch_alloc_page(page, 0);
		kernel_map_pages(page, 1, 1);
	}

	/* Publish the scanner position and page count back to the caller */
	cc->free_pfn = high_pfn;
	cc->nr_freepages = nr_freepages;
}

/* Update the number of anon and file isolated pages in the zone */
static void acct_isolated(struct zone *zone, struct compact_control *cc)
{
	struct page *page;
	unsigned int count[NR_LRU_LISTS] = { 0, };

	list_for_each_entry(page, &cc->migratepages, lru) {
		int lru = page_lru_base_type(page);
		count[lru]++;
	}

	cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
	cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
	__mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
	__mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
}

/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(struct zone *zone)
{
	unsigned long active, inactive, isolated;

	inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
					zone_page_state(zone, NR_INACTIVE_ANON);
	active = zone_page_state(zone, NR_ACTIVE_FILE) +
					zone_page_state(zone, NR_ACTIVE_ANON);
	isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
					zone_page_state(zone, NR_ISOLATED_ANON);

	return isolated > (inactive + active) / 2;
}

/*
 * Isolate all pages that can be migrated from the block pointed to by
 * the migrate scanner within compact_control.
 *
 * Pages are taken off the zone LRU under zone->lru_lock and queued on
 * cc->migratepages. At most COMPACT_CLUSTER_MAX pages are held at once.
 * cc->migrate_pfn is advanced to where the scan stopped.
 *
 * Returns cc->nr_migratepages, or 0 when the block was skipped (it would
 * cross the free scanner or starts in a memory hole) or a fatal signal
 * arrived while waiting for the isolated-page count to drop.
 */
static unsigned long isolate_migratepages(struct zone *zone,
					struct compact_control *cc)
{
	unsigned long low_pfn, end_pfn;
	struct list_head *migratelist = &cc->migratepages;

	/* Do not scan outside zone boundaries */
	low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);

	/*
	 * Only scan within a pageblock boundary. low_pfn is not always
	 * pageblock aligned: the scan below can stop mid-block when
	 * COMPACT_CLUSTER_MAX is reached, and the zone start itself may be
	 * unaligned. Rounding up low_pfn + 1 gives the end of the block
	 * low_pfn lives in for both the aligned and the unaligned case,
	 * whereas rounding up low_pfn + pageblock_nr_pages would run an
	 * unaligned scan on into the following block.
	 */
	end_pfn = ALIGN(low_pfn + 1, pageblock_nr_pages);

	/* Do not cross the free scanner or scan within a memory hole */
	if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) {
		cc->migrate_pfn = end_pfn;
		return 0;
	}

	/*
	 * Ensure that there are not too many pages isolated from the LRU
	 * list by either parallel reclaimers or compaction. If there are,
	 * delay for some time until fewer pages are isolated
	 */
	while (unlikely(too_many_isolated(zone))) {
		congestion_wait(BLK_RW_ASYNC, HZ/10);

		/* migrate_pfn is left untouched; the caller sees no pages */
		if (fatal_signal_pending(current))
			return 0;
	}

	/* Time to isolate some pages for migration */
	spin_lock_irq(&zone->lru_lock);
	for (; low_pfn < end_pfn; low_pfn++) {
		struct page *page;
		if (!pfn_valid_within(low_pfn))
			continue;

		/* Get the page and skip if free */
		page = pfn_to_page(low_pfn);
		if (PageBuddy(page))
			continue;

		/* Try isolate the page */
		if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
			continue;

		/* Successfully isolated */
		del_page_from_lru_list(zone, page, page_lru(page));
		list_add(&page->lru, migratelist);
		mem_cgroup_del_lru(page);
		cc->nr_migratepages++;

		/*
		 * Avoid isolating too much. Breaking here skips the loop
		 * increment, so the next call resumes at this same PFN.
		 */
		if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
			break;
	}

	/* Update the zone's isolated counters while lru_lock is still held */
	acct_isolated(zone, cc);

	spin_unlock_irq(&zone->lru_lock);
	cc->migrate_pfn = low_pfn;

	return cc->nr_migratepages;
}

/*
 * This is a migrate-callback that "allocates" freepages by taking pages
 * from the isolated freelists in the block we are migrating to.
 */
static struct page *compaction_alloc(struct page *migratepage,
					unsigned long data,
					int **result)
{
	struct compact_control *cc = (struct compact_control *)data;
	struct page *freepage;

	/* Isolate free pages if necessary */
	if (list_empty(&cc->freepages)) {
		isolate_freepages(cc->zone, cc);

		if (list_empty(&cc->freepages))
			return NULL;
	}

	freepage = list_entry(cc->freepages.next, struct page, lru);
	list_del(&freepage->lru);
	cc->nr_freepages--;

	return freepage;
}

/*
 * We cannot control nr_migratepages and nr_freepages fully when migration is
 * running as migrate_pages() has no knowledge of compact_control. When
 * migration is complete, we count the number of pages on the lists by hand.
 */
static void update_nr_listpages(struct compact_control *cc)
{
	int nr_migratepages = 0;
	int nr_freepages = 0;
	struct page *page;

	list_for_each_entry(page, &cc->migratepages, lru)
		nr_migratepages++;
	list_for_each_entry(page, &cc->freepages, lru)
		nr_freepages++;

	cc->nr_migratepages = nr_migratepages;
	cc->nr_freepages = nr_freepages;
}

/*
 * Decide whether a compaction run on @zone should carry on.
 *
 * Returns COMPACT_PARTIAL when a fatal signal is pending or when a free
 * page that can satisfy cc->order is already available, COMPACT_COMPLETE
 * when the free and migrate scanners have met, and COMPACT_CONTINUE
 * otherwise.
 */
static int compact_finished(struct zone *zone,
						struct compact_control *cc)
{
	unsigned int order;
	unsigned long watermark;

	if (fatal_signal_pending(current))
		return COMPACT_PARTIAL;

	/* Compaction run completes if the migrate and free scanner meet */
	if (cc->free_pfn <= cc->migrate_pfn)
		return COMPACT_COMPLETE;

	/*
	 * order == -1 means there is no target allocation order (this is
	 * what compact_node() passes), so keep going until the scanners
	 * meet. This must be tested before the watermark is derived from
	 * the order: shifting by the wrapped unsigned value is undefined.
	 */
	if (cc->order == -1)
		return COMPACT_CONTINUE;

	/* Compaction run is not finished if the watermark is not met */
	watermark = low_wmark_pages(zone) + (1UL << cc->order);
	if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
		return COMPACT_CONTINUE;

	/* Direct compactor: Is a suitable page free? */
	for (order = cc->order; order < MAX_ORDER; order++) {
		/* Job done if page is free of the right migratetype */
		if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
			return COMPACT_PARTIAL;

		/* Job done if allocation would set block type */
		if (order >= pageblock_order && zone->free_area[order].nr_free)
			return COMPACT_PARTIAL;
	}

	return COMPACT_CONTINUE;
}

/*
 * Run one compaction pass over @zone using the state in @cc.
 *
 * The migrate scanner starts at the first PFN of the zone and the free
 * scanner at the end of the zone rounded down to a pageblock boundary.
 * Batches of pages are isolated and migrated until compact_finished()
 * stops asking to continue; its final verdict is returned.
 */
static int compact_zone(struct zone *zone, struct compact_control *cc)
{
	int status;

	/* Setup to move all movable pages to the end of the zone */
	cc->migrate_pfn = zone->zone_start_pfn;
	cc->free_pfn = (cc->migrate_pfn + zone->spanned_pages) &
						~(pageblock_nr_pages-1);

	migrate_prep_local();

	for (;;) {
		unsigned long nr_isolated, nr_failed;

		status = compact_finished(zone, cc);
		if (status != COMPACT_CONTINUE)
			break;

		/* Nothing isolated from this block, move on to the next */
		if (isolate_migratepages(zone, cc) == 0)
			continue;

		nr_isolated = cc->nr_migratepages;
		migrate_pages(&cc->migratepages, compaction_alloc,
						(unsigned long)cc, 0);
		/* Whatever is still on the list afterwards did not move */
		update_nr_listpages(cc);
		nr_failed = cc->nr_migratepages;

		count_vm_event(COMPACTBLOCKS);
		count_vm_events(COMPACTPAGES, nr_isolated - nr_failed);
		if (nr_failed)
			count_vm_events(COMPACTPAGEFAILED, nr_failed);

		/* Release LRU pages not migrated */
		if (list_empty(&cc->migratepages))
			continue;

		putback_lru_pages(&cc->migratepages);
		cc->nr_migratepages = 0;
	}

	/* Release free pages and check accounting */
	cc->nr_freepages -= release_freepages(&cc->freepages);
	VM_BUG_ON(cc->nr_freepages != 0);

	return status;
}

/*
 * Compact @zone on behalf of an allocation of the given @order. The
 * migratetype to look for is derived from @gfp_mask. Returns whatever
 * compact_zone() reports.
 */
static unsigned long compact_zone_order(struct zone *zone,
						int order, gfp_t gfp_mask)
{
	/* Members not named here (counters, scanner PFNs) start at zero */
	struct compact_control cc = {
		.zone = zone,
		.order = order,
		.migratetype = allocflags_to_migratetype(gfp_mask),
	};

	INIT_LIST_HEAD(&cc.migratepages);
	INIT_LIST_HEAD(&cc.freepages);

	return compact_zone(zone, &cc);
}

/*
 * Fragmentation index cut-off used by try_to_compact_pages(): a zone whose
 * index lies between 0 and this value (inclusive) is not compacted.
 */
int sysctl_extfrag_threshold = 500;

/**
 * try_to_compact_pages - Direct compact to satisfy a high-order allocation
 * @zonelist: The zonelist used for the current allocation
 * @order: The order of the current allocation
 * @gfp_mask: The GFP mask of the current allocation
 * @nodemask: The allowed nodes to allocate from
 *
 * This is the main entry point for direct page compaction. Returns
 * COMPACT_SKIPPED when no zone was compacted, otherwise a COMPACT_*
 * status describing the outcome.
 */
unsigned long try_to_compact_pages(struct zonelist *zonelist,
			int order, gfp_t gfp_mask, nodemask_t *nodemask)
{
	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
	int may_enter_fs = gfp_mask & __GFP_FS;
	int may_perform_io = gfp_mask & __GFP_IO;
	struct zoneref *z;
	struct zone *zone;
	int rc = COMPACT_SKIPPED;

	/*
	 * The page allocator is assumed to cope with the "cheaper" orders
	 * without special steps, so compaction is not even started for them.
	 */
	if (order <= PAGE_ALLOC_COSTLY_ORDER)
		return rc;

	/* Compaction is only attempted when both FS and IO are permitted */
	if (!may_enter_fs || !may_perform_io)
		return rc;

	count_vm_event(COMPACTSTALL);

	/* Compact each zone in the list */
	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
								nodemask) {
		/*
		 * Note the 2UL: during migration, copies of pages need to
		 * be allocated, so for a short time the footprint is higher.
		 */
		unsigned long watermark = low_wmark_pages(zone) + (2UL << order);
		int fragindex, status;

		/* Watermarks for order-0 must be met for compaction */
		if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
			continue;

		/*
		 * The fragmentation index tells low memory apart from
		 * external fragmentation as the cause of a failure:
		 *
		 *   -1           allocations might succeed depending on
		 *                watermarks
		 *   towards 0    failure is due to lack of memory
		 *   towards 1000 failure is due to fragmentation
		 *
		 * Only compact if a failure would be due to fragmentation.
		 */
		fragindex = fragmentation_index(zone, order);
		if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
			continue;

		/* The allocation should already succeed in this zone */
		if (fragindex == -1 &&
		    zone_watermark_ok(zone, order, watermark, 0, 0)) {
			rc = COMPACT_PARTIAL;
			break;
		}

		status = compact_zone_order(zone, order, gfp_mask);
		if (status > rc)
			rc = status;

		/* Stop as soon as one zone can satisfy the request */
		if (zone_watermark_ok(zone, order, watermark, 0, 0))
			break;
	}

	return rc;
}


/* Compact all zones within a node */
static int compact_node(int nid)
{
	int zoneid;
	pg_data_t *pgdat;
	struct zone *zone;

	if (nid < 0 || nid >= nr_node_ids || !node_online(nid))
		return -EINVAL;
	pgdat = NODE_DATA(nid);

	/* Flush pending updates to the LRU lists */
	lru_add_drain_all();

	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
		struct compact_control cc = {
			.nr_freepages = 0,
			.nr_migratepages = 0,
			.order = -1,
		};

		zone = &pgdat->node_zones[zoneid];
		if (!populated_zone(zone))
			continue;

		cc.zone = zone;
		INIT_LIST_HEAD(&cc.freepages);
		INIT_LIST_HEAD(&cc.migratepages);

		compact_zone(zone, &cc);

		VM_BUG_ON(!list_empty(&cc.freepages));
		VM_BUG_ON(!list_empty(&cc.migratepages));
	}

	return 0;
}

/* Compact all nodes in the system */
static int compact_nodes(void)
{
	int nid;

	for_each_online_node(nid)
		compact_node(nid);

	return COMPACT_COMPLETE;
}

/*
 * The written value is actually unused, all memory is compacted. Nothing
 * in this file reads the variable; sysctl_compaction_handler() only looks
 * at whether the access is a write.
 */
int sysctl_compact_memory;

/*
 * This is the entry point for compacting all nodes via /proc/sys/vm.
 *
 * A write triggers compaction of every online node; the written data in
 * @buffer is never looked at. Reads do nothing. Always returns 0:
 * compact_nodes() hands back a COMPACT_* status, which is not a 0/-errno
 * result and must not be passed on as this handler's return value.
 */
int sysctl_compaction_handler(struct ctl_table *table, int write,
			void __user *buffer, size_t *length, loff_t *ppos)
{
	if (write)
		compact_nodes();

	return 0;
}

/*
 * sysctl handler that forwards the access to proc_dointvec_minmax().
 *
 * The helper's result is returned unchanged, so a failure it reports
 * reaches the caller instead of being turned into success.
 */
int sysctl_extfrag_handler(struct ctl_table *table, int write,
			void __user *buffer, size_t *length, loff_t *ppos)
{
	return proc_dointvec_minmax(table, write, buffer, length, ppos);
}

#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
/*
 * Store hook behind the write-only "compact" attribute: any write
 * compacts the node numbered dev->id. The bytes in @buf are ignored, the
 * result of compact_node() is not checked, and the whole write is
 * reported as consumed.
 */
ssize_t sysfs_compact_node(struct sys_device *dev,
			struct sysdev_attribute *attr,
			const char *buf, size_t count)
{
	int nid = dev->id;

	compact_node(nid);

	return count;
}
static SYSDEV_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);

/*
 * Create the "compact" attribute file on @node's sysdev. Returns the
 * result of sysdev_create_file().
 */
int compaction_register_node(struct node *node)
{
	int err;

	err = sysdev_create_file(&node->sysdev, &attr_compact);

	return err;
}

/*
 * Remove the "compact" attribute file from @node's sysdev.
 *
 * This function returns void, so the removal is a plain call rather than
 * a "return <expression>" statement.
 */
void compaction_unregister_node(struct node *node)
{
	sysdev_remove_file(&node->sysdev, &attr_compact);
}
#endif /* CONFIG_SYSFS && CONFIG_NUMA */
Commit	Line	Data
748446bb MG	1	/*
	2	* linux/mm/compaction.c
	3	*
	4	* Memory compaction for the reduction of external fragmentation. Note that
	5	* this heavily depends upon page migration to do all the real heavy
	6	* lifting
	7	*
	8	* Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie>
	9	*/
	10	#include <linux/swap.h>
	11	#include <linux/migrate.h>
	12	#include <linux/compaction.h>
	13	#include <linux/mm_inline.h>
	14	#include <linux/backing-dev.h>
76ab0f53	15	#include <linux/sysctl.h>
ed4a6d7f	16	#include <linux/sysfs.h>
748446bb MG	17	#include "internal.h"
	18
	19	/*
	20	* compact_control is used to track pages being migrated and the free pages
	21	* they are being migrated to during memory compaction. The free_pfn starts
	22	* at the end of a zone and migrate_pfn begins at the start. Movable pages
	23	* are moved to the end of a zone during a compaction run and the run
	24	* completes when free_pfn <= migrate_pfn
	25	*/
	26	struct compact_control {
	27	struct list_head freepages; /* List of free pages to migrate to */
	28	struct list_head migratepages; /* List of pages being migrated */
	29	unsigned long nr_freepages; /* Number of isolated free pages */
	30	unsigned long nr_migratepages; /* Number of pages to migrate */
	31	unsigned long free_pfn; /* isolate_freepages search base */
	32	unsigned long migrate_pfn; /* isolate_migratepages search base */
	33
	34	/* Account for isolated anon and file pages */
	35	unsigned long nr_anon;
	36	unsigned long nr_file;
	37
56de7263 MG	38	unsigned int order; /* order a direct compactor needs */
56de7263 MG	39	int migratetype; /* MOVABLE, RECLAIMABLE etc */
748446bb MG	40	struct zone *zone;
	41	};
	42
	43	static unsigned long release_freepages(struct list_head *freelist)
	44	{
	45	struct page page, next;
	46	unsigned long count = 0;
	47
	48	list_for_each_entry_safe(page, next, freelist, lru) {
	49	list_del(&page->lru);
	50	__free_page(page);
	51	count++;
	52	}
	53
	54	return count;
	55	}
	56
	57	/* Isolate free pages onto a private freelist. Must hold zone->lock */
	58	static unsigned long isolate_freepages_block(struct zone *zone,
	59	unsigned long blockpfn,
	60	struct list_head *freelist)
	61	{
	62	unsigned long zone_end_pfn, end_pfn;
	63	int total_isolated = 0;
	64	struct page *cursor;
	65
	66	/* Get the last PFN we should scan for free pages at */
	67	zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
	68	end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn);
	69
	70	/* Find the first usable PFN in the block to initialse page cursor */
	71	for (; blockpfn < end_pfn; blockpfn++) {
	72	if (pfn_valid_within(blockpfn))
	73	break;
	74	}
	75	cursor = pfn_to_page(blockpfn);
	76
	77	/* Isolate free pages. This assumes the block is valid */
	78	for (; blockpfn < end_pfn; blockpfn++, cursor++) {
	79	int isolated, i;
	80	struct page *page = cursor;
	81
	82	if (!pfn_valid_within(blockpfn))
	83	continue;
	84
	85	if (!PageBuddy(page))
	86	continue;
	87
	88	/* Found a free page, break it into order-0 pages */
	89	isolated = split_free_page(page);
	90	total_isolated += isolated;
	91	for (i = 0; i < isolated; i++) {
	92	list_add(&page->lru, freelist);
	93	page++;
	94	}
	95
	96	/* If a page was split, advance to the end of it */
	97	if (isolated) {
	98	blockpfn += isolated - 1;
	99	cursor += isolated - 1;
	100	}
	101	}
	102
	103	return total_isolated;
104	}
105
106	/* Returns true if the page is within a block suitable for migration to */
107	static bool suitable_migration_target(struct page *page)
108	{
109
110	int migratetype = get_pageblock_migratetype(page);
111
112	/* Don't interfere with memory hot-remove or the min_free_kbytes blocks */
113	if (migratetype == MIGRATE_ISOLATE \|\| migratetype == MIGRATE_RESERVE)
114	return false;
115
116	/* If the page is a large free page, then allow migration */
117	if (PageBuddy(page) && page_order(page) >= pageblock_order)
118	return true;
119
120	/* If the block is MIGRATE_MOVABLE, allow migration */
121	if (migratetype == MIGRATE_MOVABLE)
122	return true;
123
124	/* Otherwise skip the block */
125	return false;
126	}
127
128	/*
129	* Based on information in the current compact_control, find blocks
130	* suitable for isolating free pages from and then isolate them.
131	*/
132	static void isolate_freepages(struct zone *zone,
133	struct compact_control *cc)
134	{
135	struct page *page;
136	unsigned long high_pfn, low_pfn, pfn;
137	unsigned long flags;
138	int nr_freepages = cc->nr_freepages;
139	struct list_head *freelist = &cc->freepages;
140
141	pfn = cc->free_pfn;
142	low_pfn = cc->migrate_pfn + pageblock_nr_pages;
143	high_pfn = low_pfn;
144
145	/*
146	* Isolate free pages until enough are available to migrate the
147	* pages on cc->migratepages. We stop searching if the migrate
148	* and free page scanners meet or enough free pages are isolated.
149	*/
150	spin_lock_irqsave(&zone->lock, flags);
151	for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages;
152	pfn -= pageblock_nr_pages) {
153	unsigned long isolated;
154
155	if (!pfn_valid(pfn))
156	continue;
157
158	/*
159	* Check for overlapping nodes/zones. It's possible on some
160	* configurations to have a setup like
161	* node0 node1 node0
162	* i.e. it's possible that all pages within a zones range of
163	* pages do not belong to a single zone.
164	*/
165	page = pfn_to_page(pfn);
166	if (page_zone(page) != zone)
167	continue;
168
169	/* Check the block is suitable for migration */
170	if (!suitable_migration_target(page))
171	continue;
172
173	/* Found a block suitable for isolating free pages from */
174	isolated = isolate_freepages_block(zone, pfn, freelist);
175	nr_freepages += isolated;
176
177	/*
178	* Record the highest PFN we isolated pages from. When next
179	* looking for free pages, the search will restart here as
180	* page migration may have returned some pages to the allocator
181	*/
182	if (isolated)
183	high_pfn = max(high_pfn, pfn);
184	}
185	spin_unlock_irqrestore(&zone->lock, flags);
186
187	/* split_free_page does not map the pages */
188	list_for_each_entry(page, freelist, lru) {
189	arch_alloc_page(page, 0);
190	kernel_map_pages(page, 1, 1);
191	}
192
193	cc->free_pfn = high_pfn;
194	cc->nr_freepages = nr_freepages;
195	}
196
197	/* Update the number of anon and file isolated pages in the zone */
198	static void acct_isolated(struct zone zone, struct compact_control cc)
199	{
200	struct page *page;
201	unsigned int count[NR_LRU_LISTS] = { 0, };
202
203	list_for_each_entry(page, &cc->migratepages, lru) {
204	int lru = page_lru_base_type(page);
205	count[lru]++;
206	}
207
208	cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
209	cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
210	__mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
211	__mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
212	}
213
214	/* Similar to reclaim, but different enough that they don't share logic */
215	static bool too_many_isolated(struct zone *zone)
216	{
bc693045	217	unsigned long active, inactive, isolated;
748446bb MG	218
	219	inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
	220	zone_page_state(zone, NR_INACTIVE_ANON);
bc693045 MK	221	active = zone_page_state(zone, NR_ACTIVE_FILE) +
bc693045 MK	222	zone_page_state(zone, NR_ACTIVE_ANON);
748446bb MG	223	isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
	224	zone_page_state(zone, NR_ISOLATED_ANON);
	225
bc693045	226	return isolated > (inactive + active) / 2;
748446bb MG	227	}
	228
	229	/*
	230	* Isolate all pages that can be migrated from the block pointed to by
	231	* the migrate scanner within compact_control.
	232	*/
	233	static unsigned long isolate_migratepages(struct zone *zone,
	234	struct compact_control *cc)
	235	{
	236	unsigned long low_pfn, end_pfn;
	237	struct list_head *migratelist = &cc->migratepages;
	238
	239	/* Do not scan outside zone boundaries */
	240	low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
	241
	242	/* Only scan within a pageblock boundary */
	243	end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages);
	244
	245	/* Do not cross the free scanner or scan within a memory hole */
	246	if (end_pfn > cc->free_pfn \|\| !pfn_valid(low_pfn)) {
	247	cc->migrate_pfn = end_pfn;
	248	return 0;
	249	}
	250
	251	/*
	252	* Ensure that there are not too many pages isolated from the LRU
	253	* list by either parallel reclaimers or compaction. If there are,
	254	* delay for some time until fewer pages are isolated
	255	*/
	256	while (unlikely(too_many_isolated(zone))) {
	257	congestion_wait(BLK_RW_ASYNC, HZ/10);
	258
	259	if (fatal_signal_pending(current))
	260	return 0;
	261	}
	262
	263	/* Time to isolate some pages for migration */
	264	spin_lock_irq(&zone->lru_lock);
	265	for (; low_pfn < end_pfn; low_pfn++) {
	266	struct page *page;
	267	if (!pfn_valid_within(low_pfn))
	268	continue;
	269
	270	/* Get the page and skip if free */
	271	page = pfn_to_page(low_pfn);
	272	if (PageBuddy(page))
	273	continue;
	274
	275	/* Try isolate the page */
	276	if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
	277	continue;
	278
	279	/* Successfully isolated */
	280	del_page_from_lru_list(zone, page, page_lru(page));
	281	list_add(&page->lru, migratelist);
	282	mem_cgroup_del_lru(page);
	283	cc->nr_migratepages++;
	284
	285	/* Avoid isolating too much */
	286	if (cc->nr_migratepages == COMPACT_CLUSTER_MAX)
	287	break;
	288	}
	289
	290	acct_isolated(zone, cc);
291
292	spin_unlock_irq(&zone->lru_lock);
293	cc->migrate_pfn = low_pfn;
294
295	return cc->nr_migratepages;
296	}
297
298	/*
299	* This is a migrate-callback that "allocates" freepages by taking pages
300	* from the isolated freelists in the block we are migrating to.
301	*/
302	static struct page compaction_alloc(struct page migratepage,
303	unsigned long data,
304	int **result)
305	{
306	struct compact_control cc = (struct compact_control )data;
307	struct page *freepage;
308
309	/* Isolate free pages if necessary */
310	if (list_empty(&cc->freepages)) {
311	isolate_freepages(cc->zone, cc);
312
313	if (list_empty(&cc->freepages))
314	return NULL;
315	}
316
317	freepage = list_entry(cc->freepages.next, struct page, lru);
318	list_del(&freepage->lru);
319	cc->nr_freepages--;
320
321	return freepage;
322	}
323
324	/*
325	* We cannot control nr_migratepages and nr_freepages fully when migration is
326	* running as migrate_pages() has no knowledge of compact_control. When
327	* migration is complete, we count the number of pages on the lists by hand.
328	*/
329	static void update_nr_listpages(struct compact_control *cc)
330	{
331	int nr_migratepages = 0;
332	int nr_freepages = 0;
333	struct page *page;
334
335	list_for_each_entry(page, &cc->migratepages, lru)
336	nr_migratepages++;
337	list_for_each_entry(page, &cc->freepages, lru)
338	nr_freepages++;
339
340	cc->nr_migratepages = nr_migratepages;
341	cc->nr_freepages = nr_freepages;
342	}
343
344	static int compact_finished(struct zone *zone,
345	struct compact_control *cc)
346	{
56de7263 MG	347	unsigned int order;
	348	unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order);
	349
748446bb MG	350	if (fatal_signal_pending(current))
	351	return COMPACT_PARTIAL;
	352
	353	/* Compaction run completes if the migrate and free scanner meet */
	354	if (cc->free_pfn <= cc->migrate_pfn)
	355	return COMPACT_COMPLETE;
	356
56de7263 MG	357	/* Compaction run is not finished if the watermark is not met */
	358	if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0))
	359	return COMPACT_CONTINUE;
	360
	361	if (cc->order == -1)
	362	return COMPACT_CONTINUE;
	363
	364	/* Direct compactor: Is a suitable page free? */
	365	for (order = cc->order; order < MAX_ORDER; order++) {
	366	/* Job done if page is free of the right migratetype */
	367	if (!list_empty(&zone->free_area[order].free_list[cc->migratetype]))
	368	return COMPACT_PARTIAL;
	369
	370	/* Job done if allocation would set block type */
	371	if (order >= pageblock_order && zone->free_area[order].nr_free)
	372	return COMPACT_PARTIAL;
	373	}
	374
748446bb MG	375	return COMPACT_CONTINUE;
	376	}
	377
	378	static int compact_zone(struct zone zone, struct compact_control cc)
	379	{
	380	int ret;
	381
	382	/* Setup to move all movable pages to the end of the zone */
	383	cc->migrate_pfn = zone->zone_start_pfn;
	384	cc->free_pfn = cc->migrate_pfn + zone->spanned_pages;
	385	cc->free_pfn &= ~(pageblock_nr_pages-1);
	386
	387	migrate_prep_local();
	388
	389	while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
	390	unsigned long nr_migrate, nr_remaining;
	391
	392	if (!isolate_migratepages(zone, cc))
	393	continue;
	394
	395	nr_migrate = cc->nr_migratepages;
	396	migrate_pages(&cc->migratepages, compaction_alloc,
	397	(unsigned long)cc, 0);
	398	update_nr_listpages(cc);
	399	nr_remaining = cc->nr_migratepages;
	400
	401	count_vm_event(COMPACTBLOCKS);
	402	count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining);
	403	if (nr_remaining)
	404	count_vm_events(COMPACTPAGEFAILED, nr_remaining);
	405
	406	/* Release LRU pages not migrated */
	407	if (!list_empty(&cc->migratepages)) {
	408	putback_lru_pages(&cc->migratepages);
	409	cc->nr_migratepages = 0;
	410	}
	411
	412	}
	413
	414	/* Release free pages and check accounting */
	415	cc->nr_freepages -= release_freepages(&cc->freepages);
	416	VM_BUG_ON(cc->nr_freepages != 0);
	417
	418	return ret;
	419	}
76ab0f53	420
56de7263 MG	421	static unsigned long compact_zone_order(struct zone *zone,
	422	int order, gfp_t gfp_mask)
	423	{
	424	struct compact_control cc = {
	425	.nr_freepages = 0,
	426	.nr_migratepages = 0,
	427	.order = order,
	428	.migratetype = allocflags_to_migratetype(gfp_mask),
	429	.zone = zone,
	430	};
	431	INIT_LIST_HEAD(&cc.freepages);
	432	INIT_LIST_HEAD(&cc.migratepages);
	433
	434	return compact_zone(zone, &cc);
	435	}
	436
5e771905 MG	437	int sysctl_extfrag_threshold = 500;
5e771905 MG	438
56de7263 MG	439	/**
	440	* try_to_compact_pages - Direct compact to satisfy a high-order allocation
	441	* @zonelist: The zonelist used for the current allocation
	442	* @order: The order of the current allocation
	443	* @gfp_mask: The GFP mask of the current allocation
	444	* @nodemask: The allowed nodes to allocate from
	445	*
	446	* This is the main entry point for direct page compaction.
	447	*/
	448	unsigned long try_to_compact_pages(struct zonelist *zonelist,
	449	int order, gfp_t gfp_mask, nodemask_t *nodemask)
	450	{
	451	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
	452	int may_enter_fs = gfp_mask & __GFP_FS;
	453	int may_perform_io = gfp_mask & __GFP_IO;
	454	unsigned long watermark;
	455	struct zoneref *z;
	456	struct zone *zone;
	457	int rc = COMPACT_SKIPPED;
	458
	459	/*
	460	* Check whether it is worth even starting compaction. The order check is
	461	* made because an assumption is made that the page allocator can satisfy
	462	* the "cheaper" orders without taking special steps
	463	*/
	464	if (order <= PAGE_ALLOC_COSTLY_ORDER \|\| !may_enter_fs \|\| !may_perform_io)
	465	return rc;
	466
	467	count_vm_event(COMPACTSTALL);
	468
	469	/* Compact each zone in the list */
	470	for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
	471	nodemask) {
	472	int fragindex;
	473	int status;
	474
	475	/*
	476	* Watermarks for order-0 must be met for compaction. Note
	477	* the 2UL. This is because during migration, copies of
	478	* pages need to be allocated and for a short time, the
	479	* footprint is higher
	480	*/
	481	watermark = low_wmark_pages(zone) + (2UL << order);
	482	if (!zone_watermark_ok(zone, 0, watermark, 0, 0))
	483	continue;
	484
	485	/*
	486	* fragmentation index determines if allocation failures are
	487	* due to low memory or external fragmentation
	488	*
	489	* index of -1 implies allocations might succeed depending
	490	* on watermarks
	491	* index towards 0 implies failure is due to lack of memory
	492	* index towards 1000 implies failure is due to fragmentation
	493	*
	494	* Only compact if a failure would be due to fragmentation.
	495	*/
	496	fragindex = fragmentation_index(zone, order);
5e771905	497	if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
56de7263 MG	498	continue;
	499
	500	if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) {
	501	rc = COMPACT_PARTIAL;
	502	break;
	503	}
	504
	505	status = compact_zone_order(zone, order, gfp_mask);
	506	rc = max(status, rc);
	507
	508	if (zone_watermark_ok(zone, order, watermark, 0, 0))
	509	break;
	510	}
	511
	512	return rc;
	513	}
	514
	515
76ab0f53 MG	516	/* Compact all zones within a node */
	517	static int compact_node(int nid)
	518	{
	519	int zoneid;
	520	pg_data_t *pgdat;
	521	struct zone *zone;
	522
	523	if (nid < 0 \|\| nid >= nr_node_ids \|\| !node_online(nid))
	524	return -EINVAL;
	525	pgdat = NODE_DATA(nid);
	526
	527	/* Flush pending updates to the LRU lists */
	528	lru_add_drain_all();
	529
	530	for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
	531	struct compact_control cc = {
	532	.nr_freepages = 0,
	533	.nr_migratepages = 0,
56de7263	534	.order = -1,
76ab0f53 MG	535	};
	536
	537	zone = &pgdat->node_zones[zoneid];
	538	if (!populated_zone(zone))
	539	continue;
	540
	541	cc.zone = zone;
	542	INIT_LIST_HEAD(&cc.freepages);
	543	INIT_LIST_HEAD(&cc.migratepages);
	544
	545	compact_zone(zone, &cc);
	546
	547	VM_BUG_ON(!list_empty(&cc.freepages));
	548	VM_BUG_ON(!list_empty(&cc.migratepages));
	549	}
	550
	551	return 0;
	552	}
	553
	554	/* Compact all nodes in the system */
	555	static int compact_nodes(void)
	556	{
	557	int nid;
	558
	559	for_each_online_node(nid)
	560	compact_node(nid);
	561
	562	return COMPACT_COMPLETE;
	563	}
	564
	565	/* The written value is actually unused, all memory is compacted */
	566	int sysctl_compact_memory;
	567
	568	/* This is the entry point for compacting all nodes via /proc/sys/vm */
	569	int sysctl_compaction_handler(struct ctl_table *table, int write,
	570	void __user buffer, size_t length, loff_t *ppos)
	571	{
	572	if (write)
	573	return compact_nodes();
	574
	575	return 0;
	576	}
ed4a6d7f	577
5e771905 MG	578	int sysctl_extfrag_handler(struct ctl_table *table, int write,
	579	void __user buffer, size_t length, loff_t *ppos)
	580	{
	581	proc_dointvec_minmax(table, write, buffer, length, ppos);
	582
	583	return 0;
	584	}
	585
ed4a6d7f MG	586	#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
	587	ssize_t sysfs_compact_node(struct sys_device *dev,
	588	struct sysdev_attribute *attr,
	589	const char *buf, size_t count)
	590	{
	591	compact_node(dev->id);
	592
	593	return count;
	594	}
	595	static SYSDEV_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node);
	596
	597	int compaction_register_node(struct node *node)
	598	{
	599	return sysdev_create_file(&node->sysdev, &attr_compact);
	600	}
	601
	602	void compaction_unregister_node(struct node *node)
	603	{
	604	return sysdev_remove_file(&node->sysdev, &attr_compact);
	605	}
	606	#endif /* CONFIG_SYSFS && CONFIG_NUMA */