[net-next-2.6.git] / mm / memory_hotplug.c

/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/cpuset.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>

#include <asm/tlbflush.h>

/*
 * Add the physical range [start, start + size) to the iomem resource tree
 * as a "System RAM" resource.
 *
 * Returns the newly registered resource, or NULL when either the resource
 * descriptor could not be allocated or request_resource() rejected the
 * range. In the latter case a message is printed and the descriptor is
 * freed before returning.
 */
static struct resource *register_memory_resource(u64 start, u64 size)
{
	struct resource *res;

	res = kzalloc(sizeof(*res), GFP_KERNEL);
	/*
	 * Running out of memory here is recoverable: this function already
	 * reports failure through a NULL return, so do not take the whole
	 * machine down with BUG_ON().
	 */
	if (!res)
		return NULL;

	res->name = "System RAM";
	res->start = start;
	res->end = start + size - 1;	/* resource ranges are inclusive */
	res->flags = IORESOURCE_MEM;
	if (request_resource(&iomem_resource, res) < 0) {
		printk("System RAM resource %llx - %llx cannot be added\n",
		(unsigned long long)res->start, (unsigned long long)res->end);
		kfree(res);
		res = NULL;
	}
	return res;
}

/*
 * Unregister a resource via release_resource() and free its descriptor.
 * Passing NULL is allowed and does nothing.
 */
static void release_memory_resource(struct resource *res)
{
	if (res) {
		release_resource(res);
		kfree(res);
	}
}


#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
/*
 * Prepare @zone for one section's worth of pages starting at
 * @phys_start_pfn.
 *
 * If the zone has no wait_table yet it is first set up through
 * init_currently_empty_zone(); afterwards the section's memmap is
 * initialised with memmap_init_zone(). Both calls are made with
 * MEMMAP_HOTPLUG.
 *
 * Returns 0 on success or the negative value reported by
 * init_currently_empty_zone().
 */
static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
	struct pglist_data *node = zone->zone_pgdat;
	int section_pages = PAGES_PER_SECTION;
	/* index of this zone inside its node's zone array */
	int zone_idx = zone - node->node_zones;
	int err;

	if (!zone->wait_table) {
		err = init_currently_empty_zone(zone, phys_start_pfn,
						section_pages, MEMMAP_HOTPLUG);
		if (err < 0)
			return err;
	}

	memmap_init_zone(section_pages, node->node_id, zone_idx,
			 phys_start_pfn, MEMMAP_HOTPLUG);
	return 0;
}

/*
 * Hot-add the single memory section that begins at @phys_start_pfn to @zone.
 *
 * Returns -EEXIST when the starting pfn is already valid, a negative value
 * from sparse_add_one_section() or __add_zone() when one of those steps
 * fails, and otherwise whatever register_new_memory() returns.
 */
static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
{
	int section_pages = PAGES_PER_SECTION;
	int err;

	/* a valid pfn means this section has been added before */
	if (pfn_valid(phys_start_pfn))
		return -EEXIST;

	err = sparse_add_one_section(zone, phys_start_pfn, section_pages);
	if (err < 0)
		return err;

	err = __add_zone(zone, phys_start_pfn);
	if (err < 0)
		return err;

	return register_new_memory(__pfn_to_section(phys_start_pfn));
}

/*
 * Generic helper for hot-adding memory. Architectures that support
 * memory hotplug are expected to call it once they have chosen the
 * zone that should receive the new pages.
 *
 * The pfn range is widened to whole sections and every section from the
 * one containing @phys_start_pfn up to the one containing the last pfn is
 * added in turn.
 *
 * Returns 0 on success, or the first error other than -EEXIST reported by
 * __add_section().
 */
int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
		 unsigned long nr_pages)
{
	/* section numbers covering the first and the last pfn of the range */
	int first_sec = pfn_to_section_nr(phys_start_pfn);
	int last_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
	unsigned long sec;
	int err;

	for (sec = first_sec; sec <= last_sec; sec++) {
		err = __add_section(zone, sec << PFN_SECTION_SHIFT);

		/*
		 * -EEXIST is tolerated here: it is finally dealt with by the
		 * ioresource collision check, see add_memory() =>
		 * register_memory_resource(), which prints a warning when
		 * there is a collision.
		 */
		if (err && err != -EEXIST)
			return err;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(__add_pages);

/*
 * Widen @zone so that its span covers [start_pfn, end_pfn) in addition to
 * whatever it covered before. The zone never shrinks here: the start only
 * moves down and the end only moves up. The update is done under the
 * zone span write lock.
 */
static void grow_zone_span(struct zone *zone,
		unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long prev_end, new_end;

	zone_span_writelock(zone);

	/* compute the end from the old start before the start is moved */
	prev_end = zone->zone_start_pfn + zone->spanned_pages;
	new_end = max(prev_end, end_pfn);

	if (zone->zone_start_pfn > start_pfn)
		zone->zone_start_pfn = start_pfn;
	zone->spanned_pages = new_end - zone->zone_start_pfn;

	zone_span_writeunlock(zone);
}

/*
 * Widen the node described by @pgdat so that its span also covers
 * [start_pfn, end_pfn). No lock is taken here; online_pages() calls this
 * while holding pgdat_resize_lock().
 */
static void grow_pgdat_span(struct pglist_data *pgdat,
		unsigned long start_pfn, unsigned long end_pfn)
{
	/* end of the node as it was before any adjustment */
	unsigned long prev_end =
		pgdat->node_start_pfn + pgdat->node_spanned_pages;
	unsigned long new_end = max(prev_end, end_pfn);

	if (pgdat->node_start_pfn > start_pfn)
		pgdat->node_start_pfn = start_pfn;
	pgdat->node_spanned_pages = new_end - pgdat->node_start_pfn;
}

/*
 * walk_memory_resource() callback used by online_pages().
 *
 * If the first page of the chunk is marked Reserved, every page in
 * [start_pfn, start_pfn + nr_pages) is handed to online_page().
 * @arg points to an unsigned long counter that is increased by the number
 * of pages onlined. Always returns 0.
 */
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			void *arg)
{
	unsigned long *total = arg;
	unsigned long done = 0;
	unsigned long off;

	/* only the first page is inspected to decide about the whole chunk */
	if (!PageReserved(pfn_to_page(start_pfn)))
		return 0;

	for (off = 0; off < nr_pages; off++) {
		online_page(pfn_to_page(start_pfn + off));
		done++;
	}

	*total += done;
	return 0;
}


/*
 * Bring the pages [pfn, pfn + nr_pages) online.
 *
 * Sends MEM_GOING_ONLINE to the memory notifier chain first; if a notifier
 * objects, MEM_CANCEL_ONLINE is sent and the error is returned. Otherwise
 * the zone and node spans are grown, the pages are onlined through
 * walk_memory_resource(), and the present-page counters, watermarks,
 * zonelists and writeback limits are refreshed.
 *
 * Returns 0 on success or the errno derived from the notifier chain.
 */
int online_pages(unsigned long pfn, unsigned long nr_pages)
{
	unsigned long flags;
	unsigned long onlined_pages = 0;
	struct zone *zone;
	int need_zonelists_rebuild = 0;
	int nid;
	int ret;
	struct memory_notify arg;

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	arg.status_change_nid = -1;

	/* tell notifiers when this node currently has no present pages */
	nid = page_to_nid(pfn_to_page(pfn));
	if (node_present_pages(nid) == 0)
		arg.status_change_nid = nid;

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret) {
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		return ret;
	}
	/*
	 * This doesn't need a lock to do pfn_to_page().
	 * The section can't be removed here because of the
	 * memory_block->state_sem.
	 */
	zone = page_zone(pfn_to_page(pfn));
	pgdat_resize_lock(zone->zone_pgdat, &flags);
	grow_zone_span(zone, pfn, pfn + nr_pages);
	grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	/*
	 * If this zone is not populated, then it is not in zonelist.
	 * This means the page allocator ignores this zone.
	 * So, zonelist must be updated after online.
	 */
	if (!populated_zone(zone))
		need_zonelists_rebuild = 1;

	/* online_pages_range() accumulates the page count in onlined_pages */
	walk_memory_resource(pfn, nr_pages, &onlined_pages,
		online_pages_range);
	zone->present_pages += onlined_pages;
	zone->zone_pgdat->node_present_pages += onlined_pages;

	setup_per_zone_pages_min();
	if (onlined_pages) {
		kswapd_run(zone_to_nid(zone));
		node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
	}

	if (need_zonelists_rebuild)
		build_all_zonelists();
	vm_total_pages = nr_free_pagecache_pages();
	writeback_set_ratelimit();

	/* MEM_ONLINE is only announced when at least one page was onlined */
	if (onlined_pages)
		memory_notify(MEM_ONLINE, &arg);

	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

/*
 * Allocate and initialise the node data for a node that is being
 * hot-added. @start is the physical start address of the new memory; it is
 * converted to a pfn and used as the node's start pfn.
 *
 * Returns the new pgdat, or NULL if arch_alloc_nodedata() failed.
 */
static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
{
	/* all-zero sizes: the node starts out with empty zones */
	unsigned long zones_size[MAX_NR_ZONES] = {0};
	unsigned long zholes_size[MAX_NR_ZONES] = {0};
	unsigned long start_pfn = start >> PAGE_SHIFT;
	struct pglist_data *new_node = arch_alloc_nodedata(nid);

	if (new_node) {
		/* publish the pgdat; NODE_DATA(nid) is usable from here on */
		arch_refresh_nodedata(nid, new_node);

		free_area_init_node(nid, new_node, zones_size, start_pfn,
				    zholes_size);
	}

	return new_node;
}

/*
 * Undo hotadd_new_pgdat(): reset the node's data pointer through
 * arch_refresh_nodedata(nid, NULL), then free @pgdat.
 */
static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
	arch_refresh_nodedata(nid, NULL);
	arch_free_nodedata(pgdat);
}


/*
 * Hot-add the physical memory range [start, start + size) to node @nid.
 *
 * The range is first registered as a "System RAM" resource. If the node is
 * not online yet, a pgdat is allocated for it. The architecture hook
 * arch_add_memory() then does the real work, after which the node is marked
 * online and, for a brand-new node, registered via register_one_node().
 *
 * Returns a non-negative value on success, -EEXIST when the resource could
 * not be registered, -ENOMEM when the pgdat could not be allocated, or the
 * negative error from arch_add_memory(). On every failure after the
 * resource was registered, the resource (and a freshly allocated pgdat) is
 * released again.
 */
int add_memory(int nid, u64 start, u64 size)
{
	pg_data_t *pgdat = NULL;
	int new_pgdat = 0;
	struct resource *res;
	int ret;

	res = register_memory_resource(start, size);
	if (!res)
		return -EEXIST;

	if (!node_online(nid)) {
		pgdat = hotadd_new_pgdat(nid, start);
		if (!pgdat) {
			/*
			 * Go through the common error path so the resource
			 * registered above is released and not leaked.
			 */
			ret = -ENOMEM;
			goto error;
		}
		new_pgdat = 1;
	}

	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size);

	if (ret < 0)
		goto error;

	/* we online node here. we can't roll back from here. */
	node_set_online(nid);

	cpuset_track_online_nodes();

	if (new_pgdat) {
		ret = register_one_node(nid);
		/*
		 * If the sysfs file of the new node can't be created, cpus on
		 * the node can't be hot-added. There is no way to roll back
		 * now, so reluctantly catch it with BUG_ON().
		 */
		BUG_ON(ret);
	}

	return ret;
error:
	/* rollback pgdat allocation and others */
	if (new_pgdat)
		rollback_node_hotadd(nid, pgdat);
	/* res is always non-NULL here; the helper tolerates NULL anyway */
	release_memory_resource(res);

	return ret;
}
EXPORT_SYMBOL_GPL(add_memory);

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * Check that the pages in [start_pfn, end_pfn) all belong to one zone.
 *
 * The range is sampled in steps of MAX_ORDER_NR_PAGES: for each step the
 * first pfn accepted by pfn_valid_within() is looked up and its zone is
 * compared with the zone seen so far. Steps without any such pfn are
 * skipped.
 *
 * Returns 1 if no sampled page disagrees about the zone, 0 otherwise.
 */
static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
	struct zone *seen = NULL;
	struct zone *cur;
	unsigned long chunk;
	int off;

	for (chunk = start_pfn; chunk < end_pfn; chunk += MAX_ORDER_NR_PAGES) {
		/* This is just a CONFIG_HOLES_IN_ZONE check. */
		for (off = 0; off < MAX_ORDER_NR_PAGES; off++)
			if (pfn_valid_within(chunk + off))
				break;
		if (off == MAX_ORDER_NR_PAGES)
			continue;	/* the whole chunk is a hole */

		cur = page_zone(pfn_to_page(chunk + off));
		if (seen && cur != seen)
			return 0;
		seen = cur;
	}
	return 1;
}

/*
 * Walking pfns is much simpler than walking the LRU lists, so look for
 * LRU pages by scanning the pfn range [start, end).
 *
 * Returns the pfn of the first valid page that has PageLRU set, or 0 when
 * there is none.
 *
 * NOTE(review): the return type is int while pfns are unsigned long, so a
 * large pfn would be narrowed on return, and pfn 0 cannot be told apart
 * from "nothing found" -- confirm whether callers can hit either case.
 */
int scan_lru_pages(unsigned long start, unsigned long end)
{
	unsigned long pfn = start;

	while (pfn < end) {
		if (pfn_valid(pfn) && PageLRU(pfn_to_page(pfn)))
			return pfn;
		pfn++;
	}
	return 0;
}

/*
 * Allocation callback handed to migrate_pages() by do_migrate_range().
 * It returns one page from alloc_page(GFP_HIGHUSER_PAGECACHE) and ignores
 * all three of its arguments.
 */
static struct page *
hotremove_migrate_alloc(struct page *page,
			unsigned long private,
			int **x)
{
	/*
	 * This should be improved: the allocation takes no account of the
	 * page being migrated.
	 */
	return alloc_page(GFP_HIGHUSER_PAGECACHE);
}


/* Upper bound on the number of pages isolated per do_migrate_range() call. */
#define NR_OFFLINE_AT_ONCE_PAGES	(256)
/*
 * Isolate up to NR_OFFLINE_AT_ONCE_PAGES in-use LRU pages found in
 * [start_pfn, end_pfn) and migrate them elsewhere.
 *
 * Returns -EBUSY if some page still had a non-zero reference count after
 * it failed to be isolated from the LRU (the pages isolated so far are put
 * back), 0 if nothing was isolated, and otherwise the result of
 * migrate_pages().
 */
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct page *page;
	int move_pages = NR_OFFLINE_AT_ONCE_PAGES;	/* remaining budget */
	int not_managed = 0;	/* in-use pages we failed to isolate */
	int ret = 0;
	LIST_HEAD(source);	/* pages isolated for migration */

	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		if (!page_count(page))
			continue;
		/*
		 * We can skip free pages. And we can only deal with pages on
		 * LRU.
		 */
		ret = isolate_lru_page(page, &source);
		if (!ret) { /* Success */
			move_pages--;
		} else {
			/* Because we don't hold the big zone->lock, we should
			   check this again here. */
			if (page_count(page))
				not_managed++;
#ifdef CONFIG_DEBUG_VM
			printk(KERN_INFO "removing from LRU failed"
					 " %lx/%d/%lx\n",
				pfn, page_count(page), page->flags);
#endif
		}
	}
	/* any in-use page that could not be isolated makes the range busy */
	ret = -EBUSY;
	if (not_managed) {
		if (!list_empty(&source))
			putback_lru_pages(&source);
		goto out;
	}
	ret = 0;
	if (list_empty(&source))
		goto out;
	/* this function returns # of failed pages */
	ret = migrate_pages(&source, hotremove_migrate_alloc, 0);

out:
	return ret;
}

/*
 * walk_memory_resource() callback: remove the chunk from free_area[] and
 * mark all of its pages as Reserved by passing [start, start + nr_pages)
 * to __offline_isolated_pages(). @data is unused. Always returns 0.
 */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
			void *data)
{
	__offline_isolated_pages(start, start + nr_pages);
	return 0;
}

/*
 * Run offline_isolated_pages_cb() over [start_pfn, end_pfn) through
 * walk_memory_resource(). The walker's return value is not checked.
 */
static void
offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL,
				offline_isolated_pages_cb);
}

/*
 * walk_memory_resource() callback: check that all pages of one chunk
 * recorded as a memory resource are isolated.
 *
 * @data points to a long counter. When test_pages_isolated() returns 0 for
 * [start_pfn, start_pfn + nr_pages), the counter is increased by nr_pages.
 * The result of test_pages_isolated() is returned unchanged.
 */
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
			void *data)
{
	long *offlined = data;
	int ret;

	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
	/* only count the chunk when the isolation test reported 0 */
	if (!ret)
		*offlined += (long)nr_pages;
	return ret;
}

/*
 * Run check_pages_isolated_cb() over [start_pfn, end_pfn) through
 * walk_memory_resource().
 *
 * Returns the number of pages counted by the callback, or the negative
 * value returned by the walk when it fails.
 */
static long
check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
	long isolated = 0;
	int err;

	err = walk_memory_resource(start_pfn, end_pfn - start_pfn, &isolated,
				   check_pages_isolated_cb);

	return (err < 0) ? (long)err : isolated;
}

/* Defined outside this file; used below to drain per-cpu page lists. */
extern void drain_all_local_pages(void);

/*
 * Take the pages [start_pfn, end_pfn) offline.
 *
 * Both ends must be aligned to pageblock_nr_pages and the range must pass
 * test_pages_in_a_zone(). The range is isolated, notifiers are told via
 * MEM_GOING_OFFLINE, and LRU pages in the range are migrated away in a
 * retry loop. The loop gives up with -EAGAIN once @timeout (in jiffies,
 * counted from the start of the loop) has passed, with -EINTR when a signal
 * is pending, or with the migration error after repeated failures.
 * When the whole range tests as isolated, the pages are removed from the
 * free lists, the page counters are reduced and MEM_OFFLINE is sent.
 *
 * Returns 0 on success, -EINVAL for a bad range, or another negative errno
 * on failure. Failures after the notifier call send MEM_CANCEL_OFFLINE and
 * undo the isolation.
 */
int offline_pages(unsigned long start_pfn,
		  unsigned long end_pfn, unsigned long timeout)
{
	unsigned long pfn, nr_pages, expire;
	long offlined_pages;
	int ret, drain, retry_max, node;
	struct zone *zone;
	struct memory_notify arg;

	BUG_ON(start_pfn >= end_pfn);
	/* at least, alignment against pageblock is necessary */
	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
		return -EINVAL;
	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
		return -EINVAL;
	/* This makes hotplug much easier... and more readable.
	   We assume this for now. */
	if (!test_pages_in_a_zone(start_pfn, end_pfn))
		return -EINVAL;

	zone = page_zone(pfn_to_page(start_pfn));
	node = zone_to_nid(zone);
	nr_pages = end_pfn - start_pfn;

	/* set above range as isolated */
	ret = start_isolate_page_range(start_pfn, end_pfn);
	if (ret)
		return ret;

	arg.start_pfn = start_pfn;
	arg.nr_pages = nr_pages;
	arg.status_change_nid = -1;
	/* removing at least as many pages as the node has present */
	if (nr_pages >= node_present_pages(node))
		arg.status_change_nid = node;

	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_removal;

	pfn = start_pfn;
	expire = jiffies + timeout;
	drain = 0;
	retry_max = 5;
repeat:
	/* start memory hot removal */
	ret = -EAGAIN;
	if (time_after(jiffies, expire))
		goto failed_removal;
	ret = -EINTR;
	if (signal_pending(current))
		goto failed_removal;
	ret = 0;
	/* drain is set once a migration pass has been attempted */
	if (drain) {
		lru_add_drain_all();
		flush_scheduled_work();
		cond_resched();
		drain_all_local_pages();
	}

	pfn = scan_lru_pages(start_pfn, end_pfn);
	if (pfn) { /* We have page on LRU */
		ret = do_migrate_range(pfn, end_pfn);
		if (!ret) {
			drain = 1;
			goto repeat;
		} else {
			/* only negative results consume the retry budget */
			if (ret < 0)
				if (--retry_max == 0)
					goto failed_removal;
			yield();
			drain = 1;
			goto repeat;
		}
	}
	/* drain all zone's lru pagevec, this is asynchronous... */
	lru_add_drain_all();
	flush_scheduled_work();
	yield();
	/* drain pcp pages, this is synchronous. */
	drain_all_local_pages();
	/* check again */
	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
	if (offlined_pages < 0) {
		ret = -EBUSY;
		goto failed_removal;
	}
	printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
	/* Ok, all of our target is isolated.
	   We cannot do rollback at this point. */
	offline_isolated_pages(start_pfn, end_pfn);
	/* reset pagetype flags and makes migrate type to be MOVABLE */
	undo_isolate_page_range(start_pfn, end_pfn);
	/* removal success */
	zone->present_pages -= offlined_pages;
	zone->zone_pgdat->node_present_pages -= offlined_pages;
	totalram_pages -= offlined_pages;
	num_physpages -= offlined_pages;

	vm_total_pages = nr_free_pagecache_pages();
	writeback_set_ratelimit();

	memory_notify(MEM_OFFLINE, &arg);
	return 0;

failed_removal:
	printk(KERN_INFO "memory offlining %lx to %lx failed\n",
		start_pfn, end_pfn);
	memory_notify(MEM_CANCEL_OFFLINE, &arg);
	/* pushback to free area */
	undo_isolate_page_range(start_pfn, end_pfn);

	return ret;
}
#else
/*
 * Stub built when CONFIG_MEMORY_HOTREMOVE is not set: removal always
 * fails with -EINVAL and both arguments are ignored.
 */
int remove_memory(u64 start, u64 size)
{
	return -EINVAL;
}
EXPORT_SYMBOL_GPL(remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */
Commit	Line	Data
3947be19 DH	1	/*
	2	* linux/mm/memory_hotplug.c
	3	*
	4	* Copyright (C)
	5	*/
	6
3947be19 DH	7	#include <linux/stddef.h>
	8	#include <linux/mm.h>
	9	#include <linux/swap.h>
	10	#include <linux/interrupt.h>
	11	#include <linux/pagemap.h>
	12	#include <linux/bootmem.h>
	13	#include <linux/compiler.h>
	14	#include <linux/module.h>
	15	#include <linux/pagevec.h>
2d1d43f6	16	#include <linux/writeback.h>
3947be19 DH	17	#include <linux/slab.h>
	18	#include <linux/sysctl.h>
	19	#include <linux/cpu.h>
	20	#include <linux/memory.h>
	21	#include <linux/memory_hotplug.h>
	22	#include <linux/highmem.h>
	23	#include <linux/vmalloc.h>
0a547039	24	#include <linux/ioport.h>
38837fc7	25	#include <linux/cpuset.h>
0c0e6195 KH	26	#include <linux/delay.h>
	27	#include <linux/migrate.h>
	28	#include <linux/page-isolation.h>
3947be19 DH	29
	30	#include <asm/tlbflush.h>
	31
45e0b78b KM	32	/* add this memory to iomem resource */
	33	static struct resource *register_memory_resource(u64 start, u64 size)
	34	{
	35	struct resource *res;
	36	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
	37	BUG_ON(!res);
	38
	39	res->name = "System RAM";
	40	res->start = start;
	41	res->end = start + size - 1;
	42	res->flags = IORESOURCE_MEM;
	43	if (request_resource(&iomem_resource, res) < 0) {
	44	printk("System RAM resource %llx - %llx cannot be added\n",
	45	(unsigned long long)res->start, (unsigned long long)res->end);
	46	kfree(res);
	47	res = NULL;
	48	}
	49	return res;
	50	}
	51
	52	static void release_memory_resource(struct resource *res)
	53	{
	54	if (!res)
	55	return;
	56	release_resource(res);
	57	kfree(res);
	58	return;
	59	}
	60
	61
53947027	62	#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
718127cc	63	static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
3947be19 DH	64	{
	65	struct pglist_data *pgdat = zone->zone_pgdat;
	66	int nr_pages = PAGES_PER_SECTION;
	67	int nid = pgdat->node_id;
	68	int zone_type;
	69
	70	zone_type = zone - pgdat->node_zones;
13466c84	71	if (!zone->wait_table) {
718127cc	72	int ret = 0;
a2f3aa02 DH	73	ret = init_currently_empty_zone(zone, phys_start_pfn,
a2f3aa02 DH	74	nr_pages, MEMMAP_HOTPLUG);
718127cc YG	75	if (ret < 0)
	76	return ret;
	77	}
a2f3aa02 DH	78	memmap_init_zone(nr_pages, nid, zone_type,
a2f3aa02 DH	79	phys_start_pfn, MEMMAP_HOTPLUG);
718127cc	80	return 0;
3947be19 DH	81	}
3947be19 DH	82
3947be19 DH	83	static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
3947be19 DH	84	{
3947be19	85	int nr_pages = PAGES_PER_SECTION;
3947be19 DH	86	int ret;
3947be19 DH	87
ebd15302 KH	88	if (pfn_valid(phys_start_pfn))
	89	return -EEXIST;
	90
0b0acbec	91	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
3947be19 DH	92
	93	if (ret < 0)
	94	return ret;
	95
718127cc YG	96	ret = __add_zone(zone, phys_start_pfn);
	97
	98	if (ret < 0)
	99	return ret;
	100
3947be19 DH	101	return register_new_memory(__pfn_to_section(phys_start_pfn));
	102	}
	103
	104	/*
	105	* Reasonably generic function for adding memory. It is
	106	* expected that archs that support memory hotplug will
	107	* call this function after deciding the zone to which to
	108	* add the new pages.
	109	*/
	110	int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
	111	unsigned long nr_pages)
	112	{
	113	unsigned long i;
	114	int err = 0;
6f712711 KH	115	int start_sec, end_sec;
	116	/* during initialize mem_map, align hot-added range to section */
	117	start_sec = pfn_to_section_nr(phys_start_pfn);
	118	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
3947be19	119
6f712711 KH	120	for (i = start_sec; i <= end_sec; i++) {
6f712711 KH	121	err = __add_section(zone, i << PFN_SECTION_SHIFT);
3947be19	122
6f712711	123	/*
183ff22b	124	* EEXIST is finally dealt with by ioresource collision
6f712711 KH	125	* check. see add_memory() => register_memory_resource()
6f712711 KH	126	* Warning will be printed if there is collision.
bed120c6 JS	127	*/
bed120c6 JS	128	if (err && (err != -EEXIST))
3947be19	129	break;
6f712711	130	err = 0;
3947be19 DH	131	}
	132
	133	return err;
	134	}
bed120c6	135	EXPORT_SYMBOL_GPL(__add_pages);
3947be19 DH	136
	137	static void grow_zone_span(struct zone *zone,
	138	unsigned long start_pfn, unsigned long end_pfn)
	139	{
	140	unsigned long old_zone_end_pfn;
	141
	142	zone_span_writelock(zone);
	143
	144	old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
	145	if (start_pfn < zone->zone_start_pfn)
	146	zone->zone_start_pfn = start_pfn;
	147
25a6df95 YG	148	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
25a6df95 YG	149	zone->zone_start_pfn;
3947be19 DH	150
	151	zone_span_writeunlock(zone);
	152	}
	153
	154	static void grow_pgdat_span(struct pglist_data *pgdat,
	155	unsigned long start_pfn, unsigned long end_pfn)
	156	{
	157	unsigned long old_pgdat_end_pfn =
	158	pgdat->node_start_pfn + pgdat->node_spanned_pages;
	159
	160	if (start_pfn < pgdat->node_start_pfn)
	161	pgdat->node_start_pfn = start_pfn;
	162
25a6df95 YG	163	pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
25a6df95 YG	164	pgdat->node_start_pfn;
3947be19 DH	165	}
3947be19 DH	166
75884fb1 KH	167	static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
75884fb1 KH	168	void *arg)
3947be19 DH	169	{
3947be19 DH	170	unsigned long i;
75884fb1 KH	171	unsigned long onlined_pages = (unsigned long )arg;
	172	struct page *page;
	173	if (PageReserved(pfn_to_page(start_pfn)))
	174	for (i = 0; i < nr_pages; i++) {
	175	page = pfn_to_page(start_pfn + i);
	176	online_page(page);
	177	onlined_pages++;
	178	}
	179	(unsigned long )arg = onlined_pages;
	180	return 0;
	181	}
	182
	183
	184	int online_pages(unsigned long pfn, unsigned long nr_pages)
	185	{
3947be19 DH	186	unsigned long flags;
	187	unsigned long onlined_pages = 0;
	188	struct zone *zone;
6811378e	189	int need_zonelists_rebuild = 0;
7b78d335 YG	190	int nid;
	191	int ret;
	192	struct memory_notify arg;
	193
	194	arg.start_pfn = pfn;
	195	arg.nr_pages = nr_pages;
	196	arg.status_change_nid = -1;
	197
	198	nid = page_to_nid(pfn_to_page(pfn));
	199	if (node_present_pages(nid) == 0)
	200	arg.status_change_nid = nid;
3947be19	201
7b78d335 YG	202	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	203	ret = notifier_to_errno(ret);
	204	if (ret) {
	205	memory_notify(MEM_CANCEL_ONLINE, &arg);
	206	return ret;
	207	}
3947be19 DH	208	/*
	209	* This doesn't need a lock to do pfn_to_page().
	210	* The section can't be removed here because of the
	211	* memory_block->state_sem.
	212	*/
	213	zone = page_zone(pfn_to_page(pfn));
	214	pgdat_resize_lock(zone->zone_pgdat, &flags);
	215	grow_zone_span(zone, pfn, pfn + nr_pages);
	216	grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
	217	pgdat_resize_unlock(zone->zone_pgdat, &flags);
	218
6811378e YG	219	/*
	220	* If this zone is not populated, then it is not in zonelist.
	221	* This means the page allocator ignores this zone.
	222	* So, zonelist must be updated after online.
	223	*/
	224	if (!populated_zone(zone))
	225	need_zonelists_rebuild = 1;
	226
75884fb1 KH	227	walk_memory_resource(pfn, nr_pages, &onlined_pages,
75884fb1 KH	228	online_pages_range);
3947be19	229	zone->present_pages += onlined_pages;
f2937be5	230	zone->zone_pgdat->node_present_pages += onlined_pages;
3947be19	231
61b13993	232	setup_per_zone_pages_min();
7ea1530a CL	233	if (onlined_pages) {
	234	kswapd_run(zone_to_nid(zone));
	235	node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
	236	}
61b13993	237
6811378e YG	238	if (need_zonelists_rebuild)
6811378e YG	239	build_all_zonelists();
5a4d4361	240	vm_total_pages = nr_free_pagecache_pages();
2d1d43f6	241	writeback_set_ratelimit();
7b78d335 YG	242
	243	if (onlined_pages)
	244	memory_notify(MEM_ONLINE, &arg);
	245
3947be19 DH	246	return 0;
3947be19 DH	247	}
53947027	248	#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
bc02af93	249
9af3c2de YG	250	static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
	251	{
	252	struct pglist_data *pgdat;
	253	unsigned long zones_size[MAX_NR_ZONES] = {0};
	254	unsigned long zholes_size[MAX_NR_ZONES] = {0};
	255	unsigned long start_pfn = start >> PAGE_SHIFT;
	256
	257	pgdat = arch_alloc_nodedata(nid);
	258	if (!pgdat)
	259	return NULL;
	260
	261	arch_refresh_nodedata(nid, pgdat);
	262
	263	/* we can use NODE_DATA(nid) from here */
	264
	265	/* init node's zones as empty zones, we don't have any present pages.*/
	266	free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size);
	267
	268	return pgdat;
	269	}
	270
	271	static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
	272	{
	273	arch_refresh_nodedata(nid, NULL);
	274	arch_free_nodedata(pgdat);
	275	return;
	276	}
	277
0a547039	278
bc02af93 YG	279	int add_memory(int nid, u64 start, u64 size)
bc02af93 YG	280	{
9af3c2de YG	281	pg_data_t *pgdat = NULL;
9af3c2de YG	282	int new_pgdat = 0;
ebd15302	283	struct resource *res;
bc02af93 YG	284	int ret;
bc02af93 YG	285
ebd15302 KH	286	res = register_memory_resource(start, size);
	287	if (!res)
	288	return -EEXIST;
	289
9af3c2de YG	290	if (!node_online(nid)) {
	291	pgdat = hotadd_new_pgdat(nid, start);
	292	if (!pgdat)
	293	return -ENOMEM;
	294	new_pgdat = 1;
9af3c2de YG	295	}
9af3c2de YG	296
bc02af93 YG	297	/* call arch's memory hotadd */
	298	ret = arch_add_memory(nid, start, size);
	299
9af3c2de YG	300	if (ret < 0)
	301	goto error;
	302
0fc44159	303	/* we online node here. we can't roll back from here. */
9af3c2de YG	304	node_set_online(nid);
9af3c2de YG	305
38837fc7 PJ	306	cpuset_track_online_nodes();
38837fc7 PJ	307
0fc44159 YG	308	if (new_pgdat) {
	309	ret = register_one_node(nid);
	310	/*
	311	* If sysfs file of new node can't create, cpu on the node
	312	* can't be hot-added. There is no rollback way now.
	313	* So, check by BUG_ON() to catch it reluctantly..
	314	*/
	315	BUG_ON(ret);
	316	}
	317
9af3c2de YG	318	return ret;
	319	error:
	320	/* rollback pgdat allocation and others */
	321	if (new_pgdat)
	322	rollback_node_hotadd(nid, pgdat);
ebd15302 KH	323	if (res)
ebd15302 KH	324	release_memory_resource(res);
9af3c2de	325
bc02af93 YG	326	return ret;
	327	}
	328	EXPORT_SYMBOL_GPL(add_memory);
0c0e6195 KH	329
	330	#ifdef CONFIG_MEMORY_HOTREMOVE
	331	/*
	332	* Confirm all pages in a range [start, end) is belongs to the same zone.
	333	*/
	334	static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
	335	{
	336	unsigned long pfn;
	337	struct zone *zone = NULL;
	338	struct page *page;
	339	int i;
	340	for (pfn = start_pfn;
	341	pfn < end_pfn;
	342	pfn += MAX_ORDER_NR_PAGES) {
	343	i = 0;
	344	/* This is just a CONFIG_HOLES_IN_ZONE check.*/
	345	while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
	346	i++;
	347	if (i == MAX_ORDER_NR_PAGES)
	348	continue;
	349	page = pfn_to_page(pfn + i);
	350	if (zone && page_zone(page) != zone)
	351	return 0;
	352	zone = page_zone(page);
	353	}
	354	return 1;
	355	}
	356
	357	/*
	358	* Scanning pfn is much easier than scanning lru list.
	359	* Scan pfn from start to end and Find LRU page.
	360	*/
	361	int scan_lru_pages(unsigned long start, unsigned long end)
	362	{
	363	unsigned long pfn;
	364	struct page *page;
	365	for (pfn = start; pfn < end; pfn++) {
	366	if (pfn_valid(pfn)) {
	367	page = pfn_to_page(pfn);
	368	if (PageLRU(page))
	369	return pfn;
	370	}
	371	}
	372	return 0;
	373	}
	374
	375	static struct page *
	376	hotremove_migrate_alloc(struct page *page,
	377	unsigned long private,
	378	int **x)
	379	{
	380	/* This should be improoooooved!! */
	381	return alloc_page(GFP_HIGHUSER_PAGECACHE);
	382	}
	383
	384
	385	#define NR_OFFLINE_AT_ONCE_PAGES (256)
	386	static int
	387	do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
	388	{
	389	unsigned long pfn;
	390	struct page *page;
	391	int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
	392	int not_managed = 0;
393	int ret = 0;
394	LIST_HEAD(source);
395
396	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
397	if (!pfn_valid(pfn))
398	continue;
399	page = pfn_to_page(pfn);
400	if (!page_count(page))
401	continue;
402	/*
403	* We can skip free pages. And we can only deal with pages on
404	* LRU.
405	*/
406	ret = isolate_lru_page(page, &source);
407	if (!ret) { /* Success */
408	move_pages--;
409	} else {
410	/* Becasue we don't have big zone->lock. we should
411	check this again here. */
412	if (page_count(page))
413	not_managed++;
414	#ifdef CONFIG_DEBUG_VM
415	printk(KERN_INFO "removing from LRU failed"
416	" %lx/%d/%lx\n",
417	pfn, page_count(page), page->flags);
418	#endif
419	}
420	}
421	ret = -EBUSY;
422	if (not_managed) {
423	if (!list_empty(&source))
424	putback_lru_pages(&source);
425	goto out;
426	}
427	ret = 0;
428	if (list_empty(&source))
429	goto out;
430	/* this function returns # of failed pages */
431	ret = migrate_pages(&source, hotremove_migrate_alloc, 0);
432
433	out:
434	return ret;
435	}
436
437	/*
438	* remove from free_area[] and mark all as Reserved.
439	*/
440	static int
441	offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
442	void *data)
443	{
444	__offline_isolated_pages(start, start + nr_pages);
445	return 0;
446	}
447
448	static void
449	offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
450	{
451	walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL,
452	offline_isolated_pages_cb);
453	}
454
455	/*
456	* Check all pages in range, recoreded as memory resource, are isolated.
457	*/
458	static int
459	check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
460	void *data)
461	{
462	int ret;
463	long offlined = (long )data;
464	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
465	offlined = nr_pages;
466	if (!ret)
467	(long )data += offlined;
468	return ret;
469	}
470
471	static long
472	check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
473	{
474	long offlined = 0;
475	int ret;
476
477	ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined,
478	check_pages_isolated_cb);
479	if (ret < 0)
480	offlined = (long)ret;
481	return offlined;
482	}
483
484	extern void drain_all_local_pages(void);
485
486	int offline_pages(unsigned long start_pfn,
487	unsigned long end_pfn, unsigned long timeout)
488	{
489	unsigned long pfn, nr_pages, expire;
490	long offlined_pages;
7b78d335	491	int ret, drain, retry_max, node;
0c0e6195	492	struct zone *zone;
7b78d335	493	struct memory_notify arg;
0c0e6195 KH	494
	495	BUG_ON(start_pfn >= end_pfn);
	496	/* at least, alignment against pageblock is necessary */
	497	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
	498	return -EINVAL;
	499	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
	500	return -EINVAL;
	501	/* This makes hotplug much easier...and readable.
	502	we assume this for now. .*/
	503	if (!test_pages_in_a_zone(start_pfn, end_pfn))
	504	return -EINVAL;
7b78d335 YG	505
	506	zone = page_zone(pfn_to_page(start_pfn));
	507	node = zone_to_nid(zone);
	508	nr_pages = end_pfn - start_pfn;
	509
0c0e6195 KH	510	/* set above range as isolated */
	511	ret = start_isolate_page_range(start_pfn, end_pfn);
	512	if (ret)
	513	return ret;
7b78d335 YG	514
	515	arg.start_pfn = start_pfn;
	516	arg.nr_pages = nr_pages;
	517	arg.status_change_nid = -1;
	518	if (nr_pages >= node_present_pages(node))
	519	arg.status_change_nid = node;
	520
	521	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	522	ret = notifier_to_errno(ret);
	523	if (ret)
	524	goto failed_removal;
	525
0c0e6195 KH	526	pfn = start_pfn;
	527	expire = jiffies + timeout;
	528	drain = 0;
	529	retry_max = 5;
	530	repeat:
	531	/* start memory hot removal */
	532	ret = -EAGAIN;
	533	if (time_after(jiffies, expire))
	534	goto failed_removal;
	535	ret = -EINTR;
	536	if (signal_pending(current))
	537	goto failed_removal;
	538	ret = 0;
	539	if (drain) {
	540	lru_add_drain_all();
	541	flush_scheduled_work();
	542	cond_resched();
	543	drain_all_local_pages();
	544	}
	545
	546	pfn = scan_lru_pages(start_pfn, end_pfn);
	547	if (pfn) { /* We have page on LRU */
	548	ret = do_migrate_range(pfn, end_pfn);
	549	if (!ret) {
	550	drain = 1;
	551	goto repeat;
	552	} else {
	553	if (ret < 0)
	554	if (--retry_max == 0)
	555	goto failed_removal;
	556	yield();
	557	drain = 1;
	558	goto repeat;
	559	}
	560	}
	561	/* drain all zone's lru pagevec, this is asyncronous... */
	562	lru_add_drain_all();
	563	flush_scheduled_work();
	564	yield();
	565	/* drain pcp pages , this is synchrouns. */
	566	drain_all_local_pages();
	567	/* check again */
	568	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
	569	if (offlined_pages < 0) {
	570	ret = -EBUSY;
	571	goto failed_removal;
	572	}
	573	printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
	574	/* Ok, all of our target is islaoted.
	575	We cannot do rollback at this point. */
	576	offline_isolated_pages(start_pfn, end_pfn);
dbc0e4ce KH	577	/* reset pagetype flags and makes migrate type to be MOVABLE */
dbc0e4ce KH	578	undo_isolate_page_range(start_pfn, end_pfn);
0c0e6195	579	/* removal success */
0c0e6195 KH	580	zone->present_pages -= offlined_pages;
	581	zone->zone_pgdat->node_present_pages -= offlined_pages;
	582	totalram_pages -= offlined_pages;
	583	num_physpages -= offlined_pages;
7b78d335	584
0c0e6195 KH	585	vm_total_pages = nr_free_pagecache_pages();
0c0e6195 KH	586	writeback_set_ratelimit();
7b78d335 YG	587
7b78d335 YG	588	memory_notify(MEM_OFFLINE, &arg);
0c0e6195 KH	589	return 0;
	590
	591	failed_removal:
	592	printk(KERN_INFO "memory offlining %lx to %lx failed\n",
	593	start_pfn, end_pfn);
7b78d335	594	memory_notify(MEM_CANCEL_OFFLINE, &arg);
0c0e6195 KH	595	/* pushback to free area */
0c0e6195 KH	596	undo_isolate_page_range(start_pfn, end_pfn);
7b78d335	597
0c0e6195 KH	598	return ret;
0c0e6195 KH	599	}
48e94196 KH	600	#else
	601	int remove_memory(u64 start, u64 size)
	602	{
	603	return -EINVAL;
	604	}
	605	EXPORT_SYMBOL_GPL(remove_memory);
0c0e6195	606	#endif /* CONFIG_MEMORY_HOTREMOVE */