[net-next-2.6.git] / mm / memory_hotplug.c

/*
 *  linux/mm/memory_hotplug.c
 *
 *  Copyright (C)
 */

#include <linux/stddef.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/pagevec.h>
#include <linux/writeback.h>
#include <linux/slab.h>
#include <linux/sysctl.h>
#include <linux/cpu.h>
#include <linux/memory.h>
#include <linux/memory_hotplug.h>
#include <linux/highmem.h>
#include <linux/vmalloc.h>
#include <linux/ioport.h>
#include <linux/cpuset.h>
#include <linux/delay.h>
#include <linux/migrate.h>
#include <linux/page-isolation.h>

#include <asm/tlbflush.h>

/*
 * Add the physical range [start, start + size) to the iomem resource tree
 * as a busy "System RAM" region.
 *
 * Returns the newly allocated resource on success.  Returns NULL when the
 * resource descriptor cannot be allocated, or when request_resource()
 * rejects the range (a message is printed in that case).  A successful
 * registration is undone with release_memory_resource().
 */
static struct resource *register_memory_resource(u64 start, u64 size)
{
	struct resource *res;

	res = kzalloc(sizeof(*res), GFP_KERNEL);
	/*
	 * Running out of memory while hot-adding is recoverable: fail the
	 * request instead of taking the whole machine down with BUG().
	 */
	if (!res)
		return NULL;

	res->name = "System RAM";
	res->start = start;
	res->end = start + size - 1;
	res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
	if (request_resource(&iomem_resource, res) < 0) {
		printk("System RAM resource %llx - %llx cannot be added\n",
		(unsigned long long)res->start, (unsigned long long)res->end);
		kfree(res);
		res = NULL;
	}
	return res;
}

/*
 * Release @res with release_resource() and free its descriptor.
 * A NULL @res is accepted and ignored.
 */
static void release_memory_resource(struct resource *res)
{
	if (res) {
		release_resource(res);
		kfree(res);
	}
}


#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
/*
 * Prepare @zone for one section of pages starting at @phys_start_pfn.
 *
 * If the zone has no wait table yet it is first set up through
 * init_currently_empty_zone(); after that the section's memmap is
 * initialised with memmap_init_zone().
 *
 * Returns 0 on success or the negative value reported by
 * init_currently_empty_zone().
 */
static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
{
	struct pglist_data *node = zone->zone_pgdat;
	int section_pages = PAGES_PER_SECTION;
	int node_id = node->node_id;
	int zone_idx = zone - node->node_zones;

	if (!zone->wait_table) {
		int err = init_currently_empty_zone(zone, phys_start_pfn,
						    section_pages,
						    MEMMAP_HOTPLUG);
		if (err < 0)
			return err;
	}

	memmap_init_zone(section_pages, node_id, zone_idx,
			 phys_start_pfn, MEMMAP_HOTPLUG);
	return 0;
}

/*
 * Hot-add the single section that begins at @phys_start_pfn to @zone.
 *
 * Returns -EEXIST when the starting pfn is already valid, a negative value
 * from sparse_add_one_section() or __add_zone() on failure, and otherwise
 * whatever register_new_memory() returns for the new section.
 */
static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
{
	int err;

	/* Nothing to add if the section is already there. */
	if (pfn_valid(phys_start_pfn))
		return -EEXIST;

	err = sparse_add_one_section(zone, phys_start_pfn, PAGES_PER_SECTION);
	if (err < 0)
		return err;

	err = __add_zone(zone, phys_start_pfn);
	if (err < 0)
		return err;

	return register_new_memory(__pfn_to_section(phys_start_pfn));
}

/*
 * Remove the section @ms from @zone.
 *
 * Returns -EINVAL if @ms is not a valid section, the non-zero result of
 * unregister_memory_section() if that fails, and 0 once the section has
 * been removed under the pgdat resize lock.
 */
static int __remove_section(struct zone *zone, struct mem_section *ms)
{
	struct pglist_data *node = zone->zone_pgdat;
	unsigned long lock_flags;
	int err;

	if (!valid_section(ms))
		return -EINVAL;

	err = unregister_memory_section(ms);
	if (err)
		return err;

	pgdat_resize_lock(node, &lock_flags);
	sparse_remove_one_section(zone, ms);
	pgdat_resize_unlock(node, &lock_flags);

	return 0;
}

/*
 * Generic helper for adding memory.  Architectures that support memory
 * hotplug are expected to pick the target zone and then call this.
 *
 * The range [phys_start_pfn, phys_start_pfn + nr_pages) is widened to
 * whole sections and every section in it is passed to __add_section().
 *
 * Returns 0 on success, or the first error other than -EEXIST.
 */
int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
		 unsigned long nr_pages)
{
	unsigned long sec;
	int first_sec, last_sec;
	int err = 0;

	/* The memmap is initialised per section, so work in section units. */
	first_sec = pfn_to_section_nr(phys_start_pfn);
	last_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);

	for (sec = first_sec; sec <= last_sec; sec++) {
		err = __add_section(zone, sec << PFN_SECTION_SHIFT);

		/*
		 * An already existing section is not an error here: the
		 * ioresource collision check in add_memory() =>
		 * register_memory_resource() deals with it and prints a
		 * warning if there is a collision.
		 */
		if (err == -EEXIST)
			err = 0;
		if (err)
			break;
	}

	return err;
}
EXPORT_SYMBOL_GPL(__add_pages);

/**
 * __remove_pages() - remove sections of pages from a zone
 * @zone: zone from which pages need to be removed
 * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
 * @nr_pages: number of pages to remove (must be multiple of section size)
 *
 * Generic helper function to remove section mappings and sysfs entries
 * for the section of the memory we are removing. Caller needs to make
 * sure that pages are marked reserved and zones are adjusted properly by
 * calling offline_pages().
 *
 * Return: 0 on success, or the first non-zero value returned by
 * __remove_section(); sections after the failing one are left in place.
 */
int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
		 unsigned long nr_pages)
{
	unsigned long i, sections_to_remove;
	/* Holds negative errno values, so it must be a signed int. */
	int ret = 0;

	/*
	 * We can only remove entire sections
	 */
	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
	BUG_ON(nr_pages % PAGES_PER_SECTION);

	release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);

	sections_to_remove = nr_pages / PAGES_PER_SECTION;
	for (i = 0; i < sections_to_remove; i++) {
		unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
		ret = __remove_section(zone, __pfn_to_section(pfn));
		if (ret)
			break;
	}
	return ret;
}
EXPORT_SYMBOL_GPL(__remove_pages);

/*
 * Extend @zone's span so that it covers [start_pfn, end_pfn) in addition
 * to what it already spans.  The update is done under the zone span
 * write lock.
 */
static void grow_zone_span(struct zone *zone,
		unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long prev_end, new_end;

	zone_span_writelock(zone);

	/* Compute the old end before the start pfn is possibly lowered. */
	prev_end = zone->zone_start_pfn + zone->spanned_pages;
	new_end = max(prev_end, end_pfn);

	if (zone->zone_start_pfn > start_pfn)
		zone->zone_start_pfn = start_pfn;
	zone->spanned_pages = new_end - zone->zone_start_pfn;

	zone_span_writeunlock(zone);
}

/*
 * Extend the span of node @pgdat so that it covers [start_pfn, end_pfn)
 * in addition to what it already spans.  No locking is done here.
 */
static void grow_pgdat_span(struct pglist_data *pgdat,
		unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long prev_end, new_end;

	/* Compute the old end before the start pfn is possibly lowered. */
	prev_end = pgdat->node_start_pfn + pgdat->node_spanned_pages;
	new_end = max(prev_end, end_pfn);

	if (pgdat->node_start_pfn > start_pfn)
		pgdat->node_start_pfn = start_pfn;
	pgdat->node_spanned_pages = new_end - pgdat->node_start_pfn;
}

/*
 * walk_memory_resource() callback used by online_pages().
 *
 * If the first page of the range is marked reserved, every page in
 * [start_pfn, start_pfn + nr_pages) is handed to online_page() and the
 * counter that @arg points to (an unsigned long) is advanced by the number
 * of pages processed.  Always returns 0.
 */
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
			void *arg)
{
	unsigned long *total = arg;
	unsigned long done = 0;
	unsigned long idx;

	if (!PageReserved(pfn_to_page(start_pfn)))
		return 0;

	for (idx = 0; idx < nr_pages; idx++) {
		online_page(pfn_to_page(start_pfn + idx));
		done++;
	}

	*total += done;
	return 0;
}


/*
 * Bring the pages in [pfn, pfn + nr_pages) online.
 *
 * Sequence: send MEM_GOING_ONLINE (cancelling with MEM_CANCEL_ONLINE if a
 * notifier objects), grow the zone and node spans, online the pages through
 * walk_memory_resource()/online_pages_range(), update the present-page
 * counters, and refresh state that depends on them (per-zone minimums,
 * kswapd, node state, zonelists, vm_total_pages, writeback rate limit).
 * MEM_ONLINE is sent only if at least one page was onlined.
 *
 * Returns 0 on success, or the errno derived from the MEM_GOING_ONLINE
 * notifier result.
 */
int online_pages(unsigned long pfn, unsigned long nr_pages)
{
	unsigned long flags;
	unsigned long onlined_pages = 0;
	struct zone *zone;
	int need_zonelists_rebuild = 0;
	int nid;
	int ret;
	struct memory_notify arg;

	arg.start_pfn = pfn;
	arg.nr_pages = nr_pages;
	arg.status_change_nid = -1;

	/* Report the node id only when the node has no present pages yet. */
	nid = page_to_nid(pfn_to_page(pfn));
	if (node_present_pages(nid) == 0)
		arg.status_change_nid = nid;

	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret) {
		memory_notify(MEM_CANCEL_ONLINE, &arg);
		return ret;
	}
	/*
	 * This doesn't need a lock to do pfn_to_page().
	 * The section can't be removed here because of the
	 * memory_block->state_mutex.
	 */
	zone = page_zone(pfn_to_page(pfn));
	pgdat_resize_lock(zone->zone_pgdat, &flags);
	grow_zone_span(zone, pfn, pfn + nr_pages);
	grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
	pgdat_resize_unlock(zone->zone_pgdat, &flags);

	/*
	 * If this zone is not populated, then it is not in zonelist.
	 * This means the page allocator ignores this zone.
	 * So, zonelist must be updated after online.
	 */
	if (!populated_zone(zone))
		need_zonelists_rebuild = 1;

	/*
	 * online_pages_range() accumulates the number of onlined pages in
	 * onlined_pages.  The return value of the walk is not checked.
	 */
	walk_memory_resource(pfn, nr_pages, &onlined_pages,
		online_pages_range);
	zone->present_pages += onlined_pages;
	zone->zone_pgdat->node_present_pages += onlined_pages;

	setup_per_zone_pages_min();
	if (onlined_pages) {
		kswapd_run(zone_to_nid(zone));
		node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
	}

	if (need_zonelists_rebuild)
		build_all_zonelists();
	vm_total_pages = nr_free_pagecache_pages();
	writeback_set_ratelimit();

	if (onlined_pages)
		memory_notify(MEM_ONLINE, &arg);

	return 0;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */

/*
 * Allocate and publish the pgdat for a node that is being hot-added.
 *
 * The new pgdat is installed with arch_refresh_nodedata() (NODE_DATA(nid)
 * is usable from then on) and its zones are initialised as empty, since
 * no pages are present yet.
 *
 * Returns the new pgdat, or NULL if arch_alloc_nodedata() fails.
 */
static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
{
	unsigned long zones_size[MAX_NR_ZONES] = {0};
	unsigned long zholes_size[MAX_NR_ZONES] = {0};
	unsigned long node_start_pfn = start >> PAGE_SHIFT;
	struct pglist_data *node;

	node = arch_alloc_nodedata(nid);
	if (node) {
		arch_refresh_nodedata(nid, node);
		free_area_init_node(nid, node, zones_size, node_start_pfn,
				    zholes_size);
	}

	return node;
}

/*
 * Undo hotadd_new_pgdat(): clear the node data entry for @nid through
 * arch_refresh_nodedata() and free @pgdat.
 */
static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
{
	arch_refresh_nodedata(nid, NULL);
	arch_free_nodedata(pgdat);
}


/*
 * Hot-add the physical range [start, start + size) to node @nid.
 *
 * The range is first registered as a "System RAM" resource.  If the node
 * is not online, a pgdat is created for it.  The architecture hook
 * arch_add_memory() then does the actual add, after which the node is
 * marked online and, for a new node, registered with register_one_node().
 *
 * Returns 0 on success, -EEXIST if the resource cannot be registered,
 * -ENOMEM if the node data cannot be allocated, or the negative value
 * returned by arch_add_memory().  On every failure after the resource was
 * registered, the resource (and any new pgdat) is released again.
 */
int add_memory(int nid, u64 start, u64 size)
{
	pg_data_t *pgdat = NULL;
	int new_pgdat = 0;
	struct resource *res;
	int ret;

	res = register_memory_resource(start, size);
	if (!res)
		return -EEXIST;

	if (!node_online(nid)) {
		pgdat = hotadd_new_pgdat(nid, start);
		if (!pgdat) {
			/* Do not leak the resource registered above. */
			ret = -ENOMEM;
			goto error;
		}
		new_pgdat = 1;
	}

	/* call arch's memory hotadd */
	ret = arch_add_memory(nid, start, size);

	if (ret < 0)
		goto error;

	/* we online node here. we can't roll back from here. */
	node_set_online(nid);

	cpuset_track_online_nodes();

	if (new_pgdat) {
		ret = register_one_node(nid);
		/*
		 * If the sysfs file of the new node can't be created, cpus
		 * on the node can't be hot-added. There is no way to roll
		 * back at this point, so catch it, reluctantly, with
		 * BUG_ON().
		 */
		BUG_ON(ret);
	}

	return ret;
error:
	/* rollback pgdat allocation and others */
	if (new_pgdat)
		rollback_node_hotadd(nid, pgdat);
	release_memory_resource(res);

	return ret;
}
EXPORT_SYMBOL_GPL(add_memory);

#ifdef CONFIG_MEMORY_HOTREMOVE
/*
 * Confirm that all pages in the range [start_pfn, end_pfn) belong to the
 * same zone.
 *
 * The range is sampled in steps of MAX_ORDER_NR_PAGES: for each step the
 * first pfn accepted by pfn_valid_within() is looked up and its zone is
 * compared with the zone seen so far.  Steps without any such pfn are
 * skipped.
 *
 * Returns 1 if no zone mismatch was found, 0 otherwise.
 */
static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
{
	struct zone *seen = NULL;
	unsigned long base;

	for (base = start_pfn; base < end_pfn; base += MAX_ORDER_NR_PAGES) {
		struct zone *cur;
		int off;

		/* This is just a CONFIG_HOLES_IN_ZONE check. */
		for (off = 0; off < MAX_ORDER_NR_PAGES; off++) {
			if (pfn_valid_within(base + off))
				break;
		}
		if (off == MAX_ORDER_NR_PAGES)
			continue;

		cur = page_zone(pfn_to_page(base + off));
		if (seen && cur != seen)
			return 0;
		seen = cur;
	}

	return 1;
}

/*
 * Scanning pfns is much easier than scanning the LRU lists.
 *
 * Scan [start, end) and return the pfn of the first valid page that is on
 * an LRU list, or 0 if there is none.
 *
 * The result is a pfn, so it is returned as unsigned long; returning it
 * through an int would truncate pfns that do not fit in an int.
 */
unsigned long scan_lru_pages(unsigned long start, unsigned long end)
{
	unsigned long pfn;
	struct page *page;

	for (pfn = start; pfn < end; pfn++) {
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageLRU(page))
				return pfn;
		}
	}
	return 0;
}

/*
 * Allocation callback handed to migrate_pages() by do_migrate_range().
 * None of the arguments is used; the target page is always a fresh
 * GFP_HIGHUSER_PAGECACHE page.
 */
static struct page *
hotremove_migrate_alloc(struct page *page,
			unsigned long private,
			int **x)
{
	struct page *target;

	/* This should be improoooooved!! */
	target = alloc_page(GFP_HIGHUSER_PAGECACHE);
	return target;
}


/* Upper bound on the number of pages isolated per do_migrate_range() call. */
#define NR_OFFLINE_AT_ONCE_PAGES	(256)
/*
 * Isolate in-use LRU pages in [start_pfn, end_pfn) and migrate them away.
 *
 * At most NR_OFFLINE_AT_ONCE_PAGES pages are isolated per call.  Invalid
 * pfns and pages with a zero page count are skipped.
 *
 * Returns:
 *   -EBUSY  if some page could not be isolated and still had a non-zero
 *           page count; everything isolated so far is put back first.
 *   0       if nothing was isolated.
 *   otherwise the return value of migrate_pages().
 */
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long pfn;
	struct page *page;
	int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
	int not_managed = 0;
	int ret = 0;
	LIST_HEAD(source);

	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
		if (!pfn_valid(pfn))
			continue;
		page = pfn_to_page(pfn);
		if (!page_count(page))
			continue;
		/*
		 * We can skip free pages. And we can only deal with pages on
		 * LRU.
		 */
		ret = isolate_lru_page(page, &source);
		if (!ret) { /* Success */
			move_pages--;
		} else {
			/* Because we don't hold the big zone->lock, the page
			   count must be checked again here. */
			if (page_count(page))
				not_managed++;
#ifdef CONFIG_DEBUG_VM
			printk(KERN_INFO "removing from LRU failed"
					 " %lx/%d/%lx\n",
				pfn, page_count(page), page->flags);
#endif
		}
	}
	/* Any in-use page we failed to isolate makes the whole pass fail. */
	ret = -EBUSY;
	if (not_managed) {
		if (!list_empty(&source))
			putback_lru_pages(&source);
		goto out;
	}
	ret = 0;
	if (list_empty(&source))
		goto out;
	/* this function returns # of failed pages */
	ret = migrate_pages(&source, hotremove_migrate_alloc, 0);

out:
	return ret;
}

/*
 * walk_memory_resource() callback: remove the pages of one memory resource
 * chunk from free_area[] and mark them all as Reserved.  @data is unused.
 * Always returns 0.
 */
static int
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
			void *data)
{
	unsigned long end = start + nr_pages;

	__offline_isolated_pages(start, end);
	return 0;
}

/*
 * Run offline_isolated_pages_cb() on [start_pfn, end_pfn) via
 * walk_memory_resource().
 */
static void
offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
	unsigned long nr_pages = end_pfn - start_pfn;

	walk_memory_resource(start_pfn, nr_pages, NULL,
				offline_isolated_pages_cb);
}

/*
 * Check that all pages in the range, recorded as memory resource, are
 * isolated.
 *
 * walk_memory_resource() callback.  @data points to a long counter; when
 * test_pages_isolated() returns 0 for the chunk, @nr_pages is added to it.
 * Returns the result of test_pages_isolated().
 */
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
			void *data)
{
	long *offlined = data;
	int ret;

	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
	if (!ret)
		*offlined += (long)nr_pages;
	return ret;
}

/*
 * Run check_pages_isolated_cb() on [start_pfn, end_pfn) via
 * walk_memory_resource().
 *
 * Returns the page count accumulated by the callback, or the negative
 * value returned by the walk.
 */
static long
check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
{
	long isolated = 0;
	int err;

	err = walk_memory_resource(start_pfn, end_pfn - start_pfn, &isolated,
			check_pages_isolated_cb);

	return (err < 0) ? (long)err : isolated;
}

/*
 * Take the pages in [start_pfn, end_pfn) offline.
 *
 * @start_pfn, @end_pfn: range to offline; both must be aligned to
 *                       pageblock_nr_pages and the range must lie within
 *                       a single zone.
 * @timeout:             added to jiffies to form the deadline for the
 *                       migrate/retry loop.
 *
 * The range is isolated, MEM_GOING_OFFLINE is sent, and LRU pages in the
 * range are migrated away in a retry loop.  Once check_pages_isolated()
 * confirms the whole range is isolated, the pages are removed from the
 * free lists, the page counters are adjusted and MEM_OFFLINE is sent.
 *
 * Returns 0 on success.  Failures:
 *   -EINVAL  misaligned range or range spanning more than one zone
 *   -EAGAIN  deadline passed
 *   -EINTR   signal pending for the current task
 *   -EBUSY   range still not fully isolated after draining
 *   or an error from start_isolate_page_range(), the MEM_GOING_OFFLINE
 *   notifiers, or do_migrate_range() once the retry budget is used up.
 * On failure after isolation, MEM_CANCEL_OFFLINE is sent and the isolation
 * is undone.
 */
int offline_pages(unsigned long start_pfn,
		  unsigned long end_pfn, unsigned long timeout)
{
	unsigned long pfn, nr_pages, expire;
	long offlined_pages;
	int ret, drain, retry_max, node;
	struct zone *zone;
	struct memory_notify arg;

	BUG_ON(start_pfn >= end_pfn);
	/* at least, alignment against pageblock is necessary */
	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
		return -EINVAL;
	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
		return -EINVAL;
	/* This makes hotplug much easier...and readable.
	   We assume this for now. */
	if (!test_pages_in_a_zone(start_pfn, end_pfn))
		return -EINVAL;

	zone = page_zone(pfn_to_page(start_pfn));
	node = zone_to_nid(zone);
	nr_pages = end_pfn - start_pfn;

	/* set above range as isolated */
	ret = start_isolate_page_range(start_pfn, end_pfn);
	if (ret)
		return ret;

	arg.start_pfn = start_pfn;
	arg.nr_pages = nr_pages;
	arg.status_change_nid = -1;
	/* Report the node id when the range is at least as large as the
	   node's present page count. */
	if (nr_pages >= node_present_pages(node))
		arg.status_change_nid = node;

	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	ret = notifier_to_errno(ret);
	if (ret)
		goto failed_removal;

	/* NOTE(review): this value of pfn is overwritten by scan_lru_pages()
	   below before it is ever read. */
	pfn = start_pfn;
	expire = jiffies + timeout;
	drain = 0;
	retry_max = 5;
repeat:
	/* start memory hot removal */
	ret = -EAGAIN;
	if (time_after(jiffies, expire))
		goto failed_removal;
	ret = -EINTR;
	if (signal_pending(current))
		goto failed_removal;
	ret = 0;
	/* drain is set after the first migration pass. */
	if (drain) {
		lru_add_drain_all();
		flush_scheduled_work();
		cond_resched();
		drain_all_pages();
	}

	pfn = scan_lru_pages(start_pfn, end_pfn);
	if (pfn) { /* We have page on LRU */
		ret = do_migrate_range(pfn, end_pfn);
		if (!ret) {
			drain = 1;
			goto repeat;
		} else {
			/* Only negative results consume the retry budget;
			   a positive result is simply retried. */
			if (ret < 0)
				if (--retry_max == 0)
					goto failed_removal;
			yield();
			drain = 1;
			goto repeat;
		}
	}
	/* drain all zone's lru pagevec, this is asynchronous... */
	lru_add_drain_all();
	flush_scheduled_work();
	yield();
	/* drain pcp pages, this is synchronous. */
	drain_all_pages();
	/* check again */
	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
	if (offlined_pages < 0) {
		ret = -EBUSY;
		goto failed_removal;
	}
	printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
	/* Ok, all of our target is isolated.
	   We cannot do rollback at this point. */
	offline_isolated_pages(start_pfn, end_pfn);
	/* reset pagetype flags and makes migrate type to be MOVABLE */
	undo_isolate_page_range(start_pfn, end_pfn);
	/* removal success */
	zone->present_pages -= offlined_pages;
	zone->zone_pgdat->node_present_pages -= offlined_pages;
	totalram_pages -= offlined_pages;
	num_physpages -= offlined_pages;

	vm_total_pages = nr_free_pagecache_pages();
	writeback_set_ratelimit();

	memory_notify(MEM_OFFLINE, &arg);
	return 0;

failed_removal:
	printk(KERN_INFO "memory offlining %lx to %lx failed\n",
		start_pfn, end_pfn);
	memory_notify(MEM_CANCEL_OFFLINE, &arg);
	/* pushback to free area */
	undo_isolate_page_range(start_pfn, end_pfn);

	return ret;
}
#else
/*
 * Stub used when CONFIG_MEMORY_HOTREMOVE is not set: memory removal is
 * not available, so every request fails with -EINVAL.  Both arguments
 * are ignored.
 */
int remove_memory(u64 start, u64 size)
{
	return -EINVAL;
}
EXPORT_SYMBOL_GPL(remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */
Commit	Line	Data
3947be19 DH	1	/*
	2	* linux/mm/memory_hotplug.c
	3	*
	4	* Copyright (C)
	5	*/
	6
3947be19 DH	7	#include <linux/stddef.h>
	8	#include <linux/mm.h>
	9	#include <linux/swap.h>
	10	#include <linux/interrupt.h>
	11	#include <linux/pagemap.h>
	12	#include <linux/bootmem.h>
	13	#include <linux/compiler.h>
	14	#include <linux/module.h>
	15	#include <linux/pagevec.h>
2d1d43f6	16	#include <linux/writeback.h>
3947be19 DH	17	#include <linux/slab.h>
	18	#include <linux/sysctl.h>
	19	#include <linux/cpu.h>
	20	#include <linux/memory.h>
	21	#include <linux/memory_hotplug.h>
	22	#include <linux/highmem.h>
	23	#include <linux/vmalloc.h>
0a547039	24	#include <linux/ioport.h>
38837fc7	25	#include <linux/cpuset.h>
0c0e6195 KH	26	#include <linux/delay.h>
	27	#include <linux/migrate.h>
	28	#include <linux/page-isolation.h>
3947be19 DH	29
	30	#include <asm/tlbflush.h>
	31
45e0b78b KM	32	/* add this memory to iomem resource */
	33	static struct resource *register_memory_resource(u64 start, u64 size)
	34	{
	35	struct resource *res;
	36	res = kzalloc(sizeof(struct resource), GFP_KERNEL);
	37	BUG_ON(!res);
	38
	39	res->name = "System RAM";
	40	res->start = start;
	41	res->end = start + size - 1;
887c3cb1	42	res->flags = IORESOURCE_MEM \| IORESOURCE_BUSY;
45e0b78b KM	43	if (request_resource(&iomem_resource, res) < 0) {
	44	printk("System RAM resource %llx - %llx cannot be added\n",
	45	(unsigned long long)res->start, (unsigned long long)res->end);
	46	kfree(res);
	47	res = NULL;
	48	}
	49	return res;
	50	}
	51
	52	static void release_memory_resource(struct resource *res)
	53	{
	54	if (!res)
	55	return;
	56	release_resource(res);
	57	kfree(res);
	58	return;
	59	}
	60
	61
53947027	62	#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
718127cc	63	static int __add_zone(struct zone *zone, unsigned long phys_start_pfn)
3947be19 DH	64	{
	65	struct pglist_data *pgdat = zone->zone_pgdat;
	66	int nr_pages = PAGES_PER_SECTION;
	67	int nid = pgdat->node_id;
	68	int zone_type;
	69
	70	zone_type = zone - pgdat->node_zones;
13466c84	71	if (!zone->wait_table) {
718127cc	72	int ret = 0;
a2f3aa02 DH	73	ret = init_currently_empty_zone(zone, phys_start_pfn,
a2f3aa02 DH	74	nr_pages, MEMMAP_HOTPLUG);
718127cc YG	75	if (ret < 0)
	76	return ret;
	77	}
a2f3aa02 DH	78	memmap_init_zone(nr_pages, nid, zone_type,
a2f3aa02 DH	79	phys_start_pfn, MEMMAP_HOTPLUG);
718127cc	80	return 0;
3947be19 DH	81	}
3947be19 DH	82
3947be19 DH	83	static int __add_section(struct zone *zone, unsigned long phys_start_pfn)
3947be19 DH	84	{
3947be19	85	int nr_pages = PAGES_PER_SECTION;
3947be19 DH	86	int ret;
3947be19 DH	87
ebd15302 KH	88	if (pfn_valid(phys_start_pfn))
	89	return -EEXIST;
	90
0b0acbec	91	ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
3947be19 DH	92
	93	if (ret < 0)
	94	return ret;
	95
718127cc YG	96	ret = __add_zone(zone, phys_start_pfn);
	97
	98	if (ret < 0)
	99	return ret;
	100
3947be19 DH	101	return register_new_memory(__pfn_to_section(phys_start_pfn));
	102	}
	103
ea01ea93 BP	104	static int __remove_section(struct zone zone, struct mem_section ms)
	105	{
	106	unsigned long flags;
	107	struct pglist_data *pgdat = zone->zone_pgdat;
	108	int ret = -EINVAL;
	109
	110	if (!valid_section(ms))
	111	return ret;
	112
	113	ret = unregister_memory_section(ms);
	114	if (ret)
	115	return ret;
	116
	117	pgdat_resize_lock(pgdat, &flags);
	118	sparse_remove_one_section(zone, ms);
	119	pgdat_resize_unlock(pgdat, &flags);
	120	return 0;
	121	}
	122
3947be19 DH	123	/*
	124	* Reasonably generic function for adding memory. It is
	125	* expected that archs that support memory hotplug will
	126	* call this function after deciding the zone to which to
	127	* add the new pages.
	128	*/
	129	int __add_pages(struct zone *zone, unsigned long phys_start_pfn,
	130	unsigned long nr_pages)
	131	{
	132	unsigned long i;
	133	int err = 0;
6f712711 KH	134	int start_sec, end_sec;
	135	/* during initialize mem_map, align hot-added range to section */
	136	start_sec = pfn_to_section_nr(phys_start_pfn);
	137	end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
3947be19	138
6f712711 KH	139	for (i = start_sec; i <= end_sec; i++) {
6f712711 KH	140	err = __add_section(zone, i << PFN_SECTION_SHIFT);
3947be19	141
6f712711	142	/*
183ff22b	143	* EEXIST is finally dealt with by ioresource collision
6f712711 KH	144	* check. see add_memory() => register_memory_resource()
6f712711 KH	145	* Warning will be printed if there is collision.
bed120c6 JS	146	*/
bed120c6 JS	147	if (err && (err != -EEXIST))
3947be19	148	break;
6f712711	149	err = 0;
3947be19 DH	150	}
	151
	152	return err;
	153	}
bed120c6	154	EXPORT_SYMBOL_GPL(__add_pages);
3947be19	155
ea01ea93 BP	156	/**
	157	* __remove_pages() - remove sections of pages from a zone
	158	* @zone: zone from which pages need to be removed
	159	* @phys_start_pfn: starting pageframe (must be aligned to start of a section)
	160	* @nr_pages: number of pages to remove (must be multiple of section size)
	161	*
	162	* Generic helper function to remove section mappings and sysfs entries
	163	* for the section of the memory we are removing. Caller needs to make
	164	* sure that pages are marked reserved and zones are adjust properly by
	165	* calling offline_pages().
	166	*/
	167	int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
	168	unsigned long nr_pages)
	169	{
	170	unsigned long i, ret = 0;
	171	int sections_to_remove;
	172
	173	/*
	174	* We can only remove entire sections
	175	*/
	176	BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
	177	BUG_ON(nr_pages % PAGES_PER_SECTION);
	178
	179	release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
	180
	181	sections_to_remove = nr_pages / PAGES_PER_SECTION;
	182	for (i = 0; i < sections_to_remove; i++) {
	183	unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
	184	ret = __remove_section(zone, __pfn_to_section(pfn));
	185	if (ret)
	186	break;
	187	}
	188	return ret;
	189	}
	190	EXPORT_SYMBOL_GPL(__remove_pages);
	191
3947be19 DH	192	static void grow_zone_span(struct zone *zone,
	193	unsigned long start_pfn, unsigned long end_pfn)
	194	{
	195	unsigned long old_zone_end_pfn;
	196
	197	zone_span_writelock(zone);
	198
	199	old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
	200	if (start_pfn < zone->zone_start_pfn)
	201	zone->zone_start_pfn = start_pfn;
	202
25a6df95 YG	203	zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
25a6df95 YG	204	zone->zone_start_pfn;
3947be19 DH	205
	206	zone_span_writeunlock(zone);
	207	}
	208
	209	static void grow_pgdat_span(struct pglist_data *pgdat,
	210	unsigned long start_pfn, unsigned long end_pfn)
	211	{
	212	unsigned long old_pgdat_end_pfn =
	213	pgdat->node_start_pfn + pgdat->node_spanned_pages;
	214
	215	if (start_pfn < pgdat->node_start_pfn)
	216	pgdat->node_start_pfn = start_pfn;
	217
25a6df95 YG	218	pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
25a6df95 YG	219	pgdat->node_start_pfn;
3947be19 DH	220	}
3947be19 DH	221
75884fb1 KH	222	static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
75884fb1 KH	223	void *arg)
3947be19 DH	224	{
3947be19 DH	225	unsigned long i;
75884fb1 KH	226	unsigned long onlined_pages = (unsigned long )arg;
	227	struct page *page;
	228	if (PageReserved(pfn_to_page(start_pfn)))
	229	for (i = 0; i < nr_pages; i++) {
	230	page = pfn_to_page(start_pfn + i);
	231	online_page(page);
	232	onlined_pages++;
	233	}
	234	(unsigned long )arg = onlined_pages;
	235	return 0;
	236	}
	237
	238
	239	int online_pages(unsigned long pfn, unsigned long nr_pages)
	240	{
3947be19 DH	241	unsigned long flags;
	242	unsigned long onlined_pages = 0;
	243	struct zone *zone;
6811378e	244	int need_zonelists_rebuild = 0;
7b78d335 YG	245	int nid;
	246	int ret;
	247	struct memory_notify arg;
	248
	249	arg.start_pfn = pfn;
	250	arg.nr_pages = nr_pages;
	251	arg.status_change_nid = -1;
	252
	253	nid = page_to_nid(pfn_to_page(pfn));
	254	if (node_present_pages(nid) == 0)
	255	arg.status_change_nid = nid;
3947be19	256
7b78d335 YG	257	ret = memory_notify(MEM_GOING_ONLINE, &arg);
	258	ret = notifier_to_errno(ret);
	259	if (ret) {
	260	memory_notify(MEM_CANCEL_ONLINE, &arg);
	261	return ret;
	262	}
3947be19 DH	263	/*
	264	* This doesn't need a lock to do pfn_to_page().
	265	* The section can't be removed here because of the
da19cbcf	266	* memory_block->state_mutex.
3947be19 DH	267	*/
	268	zone = page_zone(pfn_to_page(pfn));
	269	pgdat_resize_lock(zone->zone_pgdat, &flags);
	270	grow_zone_span(zone, pfn, pfn + nr_pages);
	271	grow_pgdat_span(zone->zone_pgdat, pfn, pfn + nr_pages);
	272	pgdat_resize_unlock(zone->zone_pgdat, &flags);
	273
6811378e YG	274	/*
	275	* If this zone is not populated, then it is not in zonelist.
	276	* This means the page allocator ignores this zone.
	277	* So, zonelist must be updated after online.
	278	*/
	279	if (!populated_zone(zone))
	280	need_zonelists_rebuild = 1;
	281
75884fb1 KH	282	walk_memory_resource(pfn, nr_pages, &onlined_pages,
75884fb1 KH	283	online_pages_range);
3947be19	284	zone->present_pages += onlined_pages;
f2937be5	285	zone->zone_pgdat->node_present_pages += onlined_pages;
3947be19	286
61b13993	287	setup_per_zone_pages_min();
7ea1530a CL	288	if (onlined_pages) {
	289	kswapd_run(zone_to_nid(zone));
	290	node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
	291	}
61b13993	292
6811378e YG	293	if (need_zonelists_rebuild)
6811378e YG	294	build_all_zonelists();
5a4d4361	295	vm_total_pages = nr_free_pagecache_pages();
2d1d43f6	296	writeback_set_ratelimit();
7b78d335 YG	297
	298	if (onlined_pages)
	299	memory_notify(MEM_ONLINE, &arg);
	300
3947be19 DH	301	return 0;
3947be19 DH	302	}
53947027	303	#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
bc02af93	304
9af3c2de YG	305	static pg_data_t *hotadd_new_pgdat(int nid, u64 start)
	306	{
	307	struct pglist_data *pgdat;
	308	unsigned long zones_size[MAX_NR_ZONES] = {0};
	309	unsigned long zholes_size[MAX_NR_ZONES] = {0};
	310	unsigned long start_pfn = start >> PAGE_SHIFT;
	311
	312	pgdat = arch_alloc_nodedata(nid);
	313	if (!pgdat)
	314	return NULL;
	315
	316	arch_refresh_nodedata(nid, pgdat);
	317
	318	/* we can use NODE_DATA(nid) from here */
	319
	320	/* init node's zones as empty zones, we don't have any present pages.*/
	321	free_area_init_node(nid, pgdat, zones_size, start_pfn, zholes_size);
	322
	323	return pgdat;
	324	}
	325
	326	static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
	327	{
	328	arch_refresh_nodedata(nid, NULL);
	329	arch_free_nodedata(pgdat);
	330	return;
	331	}
	332
0a547039	333
bc02af93 YG	334	int add_memory(int nid, u64 start, u64 size)
bc02af93 YG	335	{
9af3c2de YG	336	pg_data_t *pgdat = NULL;
9af3c2de YG	337	int new_pgdat = 0;
ebd15302	338	struct resource *res;
bc02af93 YG	339	int ret;
bc02af93 YG	340
ebd15302 KH	341	res = register_memory_resource(start, size);
	342	if (!res)
	343	return -EEXIST;
	344
9af3c2de YG	345	if (!node_online(nid)) {
	346	pgdat = hotadd_new_pgdat(nid, start);
	347	if (!pgdat)
	348	return -ENOMEM;
	349	new_pgdat = 1;
9af3c2de YG	350	}
9af3c2de YG	351
bc02af93 YG	352	/* call arch's memory hotadd */
	353	ret = arch_add_memory(nid, start, size);
	354
9af3c2de YG	355	if (ret < 0)
	356	goto error;
	357
0fc44159	358	/* we online node here. we can't roll back from here. */
9af3c2de YG	359	node_set_online(nid);
9af3c2de YG	360
38837fc7 PJ	361	cpuset_track_online_nodes();
38837fc7 PJ	362
0fc44159 YG	363	if (new_pgdat) {
	364	ret = register_one_node(nid);
	365	/*
	366	* If sysfs file of new node can't create, cpu on the node
	367	* can't be hot-added. There is no rollback way now.
	368	* So, check by BUG_ON() to catch it reluctantly..
	369	*/
	370	BUG_ON(ret);
	371	}
	372
9af3c2de YG	373	return ret;
	374	error:
	375	/* rollback pgdat allocation and others */
	376	if (new_pgdat)
	377	rollback_node_hotadd(nid, pgdat);
ebd15302 KH	378	if (res)
ebd15302 KH	379	release_memory_resource(res);
9af3c2de	380
bc02af93 YG	381	return ret;
	382	}
	383	EXPORT_SYMBOL_GPL(add_memory);
0c0e6195 KH	384
	385	#ifdef CONFIG_MEMORY_HOTREMOVE
	386	/*
	387	* Confirm all pages in a range [start, end) is belongs to the same zone.
	388	*/
	389	static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
	390	{
	391	unsigned long pfn;
	392	struct zone *zone = NULL;
	393	struct page *page;
	394	int i;
	395	for (pfn = start_pfn;
	396	pfn < end_pfn;
	397	pfn += MAX_ORDER_NR_PAGES) {
	398	i = 0;
	399	/* This is just a CONFIG_HOLES_IN_ZONE check.*/
	400	while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
	401	i++;
	402	if (i == MAX_ORDER_NR_PAGES)
	403	continue;
	404	page = pfn_to_page(pfn + i);
	405	if (zone && page_zone(page) != zone)
	406	return 0;
	407	zone = page_zone(page);
	408	}
	409	return 1;
	410	}
	411
	412	/*
	413	* Scanning pfn is much easier than scanning lru list.
	414	* Scan pfn from start to end and Find LRU page.
	415	*/
	416	int scan_lru_pages(unsigned long start, unsigned long end)
	417	{
	418	unsigned long pfn;
	419	struct page *page;
	420	for (pfn = start; pfn < end; pfn++) {
	421	if (pfn_valid(pfn)) {
	422	page = pfn_to_page(pfn);
	423	if (PageLRU(page))
	424	return pfn;
	425	}
	426	}
	427	return 0;
	428	}
	429
	430	static struct page *
	431	hotremove_migrate_alloc(struct page *page,
	432	unsigned long private,
	433	int **x)
	434	{
	435	/* This should be improoooooved!! */
	436	return alloc_page(GFP_HIGHUSER_PAGECACHE);
	437	}
	438
	439
	440	#define NR_OFFLINE_AT_ONCE_PAGES (256)
	441	static int
	442	do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
	443	{
	444	unsigned long pfn;
	445	struct page *page;
	446	int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
	447	int not_managed = 0;
448	int ret = 0;
449	LIST_HEAD(source);
450
451	for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
452	if (!pfn_valid(pfn))
453	continue;
454	page = pfn_to_page(pfn);
455	if (!page_count(page))
456	continue;
457	/*
458	* We can skip free pages. And we can only deal with pages on
459	* LRU.
460	*/
461	ret = isolate_lru_page(page, &source);
462	if (!ret) { /* Success */
463	move_pages--;
464	} else {
465	/* Becasue we don't have big zone->lock. we should
466	check this again here. */
467	if (page_count(page))
468	not_managed++;
469	#ifdef CONFIG_DEBUG_VM
470	printk(KERN_INFO "removing from LRU failed"
471	" %lx/%d/%lx\n",
472	pfn, page_count(page), page->flags);
473	#endif
474	}
475	}
476	ret = -EBUSY;
477	if (not_managed) {
478	if (!list_empty(&source))
479	putback_lru_pages(&source);
480	goto out;
481	}
482	ret = 0;
483	if (list_empty(&source))
484	goto out;
485	/* this function returns # of failed pages */
486	ret = migrate_pages(&source, hotremove_migrate_alloc, 0);
487
488	out:
489	return ret;
490	}
491
492	/*
493	* remove from free_area[] and mark all as Reserved.
494	*/
495	static int
496	offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
497	void *data)
498	{
499	__offline_isolated_pages(start, start + nr_pages);
500	return 0;
501	}
502
503	static void
504	offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
505	{
506	walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL,
507	offline_isolated_pages_cb);
508	}
509
510	/*
511	* Check all pages in range, recoreded as memory resource, are isolated.
512	*/
513	static int
514	check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
515	void *data)
516	{
517	int ret;
518	long offlined = (long )data;
519	ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
520	offlined = nr_pages;
521	if (!ret)
522	(long )data += offlined;
523	return ret;
524	}
525
526	static long
527	check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
528	{
529	long offlined = 0;
530	int ret;
531
532	ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined,
533	check_pages_isolated_cb);
534	if (ret < 0)
535	offlined = (long)ret;
536	return offlined;
537	}
538
0c0e6195 KH	539	int offline_pages(unsigned long start_pfn,
	540	unsigned long end_pfn, unsigned long timeout)
	541	{
	542	unsigned long pfn, nr_pages, expire;
	543	long offlined_pages;
7b78d335	544	int ret, drain, retry_max, node;
0c0e6195	545	struct zone *zone;
7b78d335	546	struct memory_notify arg;
0c0e6195 KH	547
	548	BUG_ON(start_pfn >= end_pfn);
	549	/* at least, alignment against pageblock is necessary */
	550	if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
	551	return -EINVAL;
	552	if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
	553	return -EINVAL;
	554	/* This makes hotplug much easier...and readable.
	555	we assume this for now. .*/
	556	if (!test_pages_in_a_zone(start_pfn, end_pfn))
	557	return -EINVAL;
7b78d335 YG	558
	559	zone = page_zone(pfn_to_page(start_pfn));
	560	node = zone_to_nid(zone);
	561	nr_pages = end_pfn - start_pfn;
	562
0c0e6195 KH	563	/* set above range as isolated */
	564	ret = start_isolate_page_range(start_pfn, end_pfn);
	565	if (ret)
	566	return ret;
7b78d335 YG	567
	568	arg.start_pfn = start_pfn;
	569	arg.nr_pages = nr_pages;
	570	arg.status_change_nid = -1;
	571	if (nr_pages >= node_present_pages(node))
	572	arg.status_change_nid = node;
	573
	574	ret = memory_notify(MEM_GOING_OFFLINE, &arg);
	575	ret = notifier_to_errno(ret);
	576	if (ret)
	577	goto failed_removal;
	578
0c0e6195 KH	579	pfn = start_pfn;
	580	expire = jiffies + timeout;
	581	drain = 0;
	582	retry_max = 5;
	583	repeat:
	584	/* start memory hot removal */
	585	ret = -EAGAIN;
	586	if (time_after(jiffies, expire))
	587	goto failed_removal;
	588	ret = -EINTR;
	589	if (signal_pending(current))
	590	goto failed_removal;
	591	ret = 0;
	592	if (drain) {
	593	lru_add_drain_all();
	594	flush_scheduled_work();
	595	cond_resched();
9f8f2172	596	drain_all_pages();
0c0e6195 KH	597	}
	598
	599	pfn = scan_lru_pages(start_pfn, end_pfn);
	600	if (pfn) { /* We have page on LRU */
	601	ret = do_migrate_range(pfn, end_pfn);
	602	if (!ret) {
	603	drain = 1;
	604	goto repeat;
	605	} else {
	606	if (ret < 0)
	607	if (--retry_max == 0)
	608	goto failed_removal;
	609	yield();
	610	drain = 1;
	611	goto repeat;
	612	}
	613	}
	614	/* drain all zone's lru pagevec, this is asyncronous... */
	615	lru_add_drain_all();
	616	flush_scheduled_work();
	617	yield();
	618	/* drain pcp pages , this is synchrouns. */
9f8f2172	619	drain_all_pages();
0c0e6195 KH	620	/* check again */
	621	offlined_pages = check_pages_isolated(start_pfn, end_pfn);
	622	if (offlined_pages < 0) {
	623	ret = -EBUSY;
	624	goto failed_removal;
	625	}
	626	printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
	627	/* Ok, all of our target is islaoted.
	628	We cannot do rollback at this point. */
	629	offline_isolated_pages(start_pfn, end_pfn);
dbc0e4ce KH	630	/* reset pagetype flags and makes migrate type to be MOVABLE */
dbc0e4ce KH	631	undo_isolate_page_range(start_pfn, end_pfn);
0c0e6195	632	/* removal success */
0c0e6195 KH	633	zone->present_pages -= offlined_pages;
	634	zone->zone_pgdat->node_present_pages -= offlined_pages;
	635	totalram_pages -= offlined_pages;
	636	num_physpages -= offlined_pages;
7b78d335	637
0c0e6195 KH	638	vm_total_pages = nr_free_pagecache_pages();
0c0e6195 KH	639	writeback_set_ratelimit();
7b78d335 YG	640
7b78d335 YG	641	memory_notify(MEM_OFFLINE, &arg);
0c0e6195 KH	642	return 0;
	643
	644	failed_removal:
	645	printk(KERN_INFO "memory offlining %lx to %lx failed\n",
	646	start_pfn, end_pfn);
7b78d335	647	memory_notify(MEM_CANCEL_OFFLINE, &arg);
0c0e6195 KH	648	/* pushback to free area */
0c0e6195 KH	649	undo_isolate_page_range(start_pfn, end_pfn);
7b78d335	650
0c0e6195 KH	651	return ret;
0c0e6195 KH	652	}
48e94196 KH	653	#else
	654	int remove_memory(u64 start, u64 size)
	655	{
	656	return -EINVAL;
	657	}
	658	EXPORT_SYMBOL_GPL(remove_memory);
0c0e6195	659	#endif /* CONFIG_MEMORY_HOTREMOVE */