* Free Software Foundation.
*
* High level machine check handler. Handles pages reported by the
- * hardware as being corrupted usually due to a 2bit ECC memory or cache
+ * hardware as being corrupted usually due to a multi-bit ECC memory or cache
* failure.
+ *
+ * In addition there is a "soft offline" entry point that allows stop using
+ * not-yet-corrupted-by-suspicious pages without killing anything.
*
* Handles page cache pages in various states. The tricky part
- * here is that we can access any page asynchronous to other VM
- * users, because memory failures could happen anytime and anywhere,
- * possibly violating some of their assumptions. This is why this code
- * has to be extremely careful. Generally it tries to use normal locking
- * rules, as in get the standard locks, even if that means the
- * error handling takes potentially a long time.
- *
- * The operation to map back from RMAP chains to processes has to walk
- * the complete process list and has non linear complexity with the number
- * mappings. In short it can be quite slow. But since memory corruptions
- * are rare we hope to get away with this.
+ * here is that we can access any page asynchronously in respect to
+ * other VM users, because memory failures could happen anytime and
+ * anywhere. This could violate some of their assumptions. This is why
+ * this code has to be extremely careful. Generally it tries to use
+ * normal locking rules, as in get the standard locks, even if that means
+ * the error handling takes potentially a long time.
+ *
+ * There are several operations here with exponential complexity because
+ * of unsuitable VM data structures. For example the operation to map back
+ * from RMAP chains to processes has to walk the complete process list and
+ * has non linear complexity with the number. But since memory corruptions
+ * are rare we hope to get away with this. This avoids impacting the core
+ * VM.
*/
/*
* - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages
* - pass bad pages to kdump next kernel
*/
-#define DEBUG 1 /* remove me in 2.6.34 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
return 0;
/*
- * page_mapping() does not accept slab page
+ * page_mapping() does not accept slab pages.
*/
if (PageSlab(p))
return -EINVAL;
* signal.
*/
static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
- unsigned long pfn)
+ unsigned long pfn, struct page *page)
{
struct siginfo si;
int ret;
#ifdef __ARCH_SI_TRAPNO
si.si_trapno = trapno;
#endif
- si.si_addr_lsb = PAGE_SHIFT;
+ si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
/*
* Don't use force here, it's convenient if the signal
* can be temporarily blocked.
int nr;
do {
nr = shrink_slab(1000, GFP_KERNEL, 1000);
- if (page_count(p) == 0)
+ if (page_count(p) == 1)
break;
} while (nr > 10);
}
struct list_head nd;
struct task_struct *tsk;
unsigned long addr;
- unsigned addr_valid:1;
+ char addr_valid;
};
/*
* a SIGKILL because the error is not contained anymore.
*/
if (tk->addr == -EFAULT) {
- pr_debug("MCE: Unable to find user space address %lx in %s\n",
+ pr_info("MCE: Unable to find user space address %lx in %s\n",
page_to_pfn(p), tsk->comm);
tk->addr_valid = 0;
}
* wrong earlier.
*/
static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
- int fail, unsigned long pfn)
+ int fail, struct page *page, unsigned long pfn)
{
struct to_kill *tk, *next;
* process anyways.
*/
else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
- pfn) < 0)
+ pfn, page) < 0)
printk(KERN_ERR
"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
pfn, tk->tsk->comm, tk->tsk->pid);
pfn, err);
} else if (page_has_private(p) &&
!try_to_release_page(p, GFP_NOIO)) {
- pr_debug("MCE %#lx: failed to release buffers\n", pfn);
+ pr_info("MCE %#lx: failed to release buffers\n", pfn);
} else {
ret = RECOVERED;
}
* Issues:
* - Error on hugepage is contained in hugepage unit (not in raw page unit.)
* To narrow down kill region to one page, we need to break up pmd.
- * - To support soft-offlining for hugepage, we need to support hugepage
- * migration.
*/
static int me_huge_page(struct page *p, unsigned long pfn)
{
+ int res = 0;
struct page *hpage = compound_head(p);
/*
* We can safely recover from error on free or reserved (i.e.
* so there is no race between isolation and mapping/unmapping.
*/
if (!(page_mapping(hpage) || PageAnon(hpage))) {
- __isolate_hwpoisoned_huge_page(hpage);
- return RECOVERED;
+ res = dequeue_hwpoisoned_huge_page(hpage);
+ if (!res)
+ return RECOVERED;
}
return DELAYED;
}
return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY;
}
-#define N_UNMAP_TRIES 5
-
/*
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
struct address_space *mapping;
LIST_HEAD(tokill);
int ret;
- int i;
int kill = 1;
struct page *hpage = compound_head(p);
if (kill)
collect_procs(hpage, &tokill);
- /*
- * try_to_unmap can fail temporarily due to races.
- * Try a few times (RED-PEN better strategy?)
- */
- for (i = 0; i < N_UNMAP_TRIES; i++) {
- ret = try_to_unmap(hpage, ttu);
- if (ret == SWAP_SUCCESS)
- break;
- pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
- }
-
+ ret = try_to_unmap(hpage, ttu);
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
pfn, page_mapcount(hpage));
* any accesses to the poisoned memory.
*/
kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
- ret != SWAP_SUCCESS, pfn);
+ ret != SWAP_SUCCESS, p, pfn);
return ret;
}
* We need/can do nothing about count=0 pages.
* 1) it's a free page, and therefore in safe hand:
* prep_new_page() will be the gate keeper.
- * 2) it's part of a non-compound high order page.
+ * 2) it's a free hugepage, which is also safe:
+ * an affected hugepage will be dequeued from hugepage freelist,
+ * so there's no concern about reusing it ever after.
+ * 3) it's part of a non-compound high order page.
* Implies some kernel user: cannot stop them from
* R/W the page; let's pray that the page has been
* used and will be freed some time later.
if (is_free_buddy_page(p)) {
action_result(pfn, "free buddy", DELAYED);
return 0;
+ } else if (PageHuge(hpage)) {
+ /*
+ * Check "just unpoisoned", "filter hit", and
+ * "race with other subpage."
+ */
+ lock_page_nosync(hpage);
+ if (!PageHWPoison(hpage)
+ || (hwpoison_filter(p) && TestClearPageHWPoison(p))
+ || (p != hpage && TestSetPageHWPoison(hpage))) {
+ atomic_long_sub(nr_pages, &mce_bad_pages);
+ return 0;
+ }
+ set_page_hwpoison_huge_page(hpage);
+ res = dequeue_hwpoisoned_huge_page(hpage);
+ action_result(pfn, "free huge",
+ res ? IGNORED : DELAYED);
+ unlock_page(hpage);
+ return res;
} else {
action_result(pfn, "high order kernel", IGNORED);
return -EBUSY;
page = compound_head(p);
if (!PageHWPoison(p)) {
- pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn);
+ pr_info("MCE: Page was already unpoisoned %#lx\n", pfn);
return 0;
}
nr_pages = 1 << compound_order(page);
if (!get_page_unless_zero(page)) {
+ /*
+ * Since HWPoisoned hugepage should have non-zero refcount,
+ * race between memory failure and unpoison seems to happen.
+ * In such case unpoison fails and memory failure runs
+ * to the end.
+ */
+ if (PageHuge(page)) {
+ pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+ return 0;
+ }
if (TestClearPageHWPoison(p))
atomic_long_sub(nr_pages, &mce_bad_pages);
- pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
+ pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn);
return 0;
}
* the free buddy page pool.
*/
if (TestClearPageHWPoison(page)) {
- pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
+ pr_info("MCE: Software-unpoisoned page %#lx\n", pfn);
atomic_long_sub(nr_pages, &mce_bad_pages);
freeit = 1;
+ if (PageHuge(page))
+ clear_page_hwpoison_huge_page(page);
}
- if (PageHuge(p))
- clear_page_hwpoison_huge_page(page);
unlock_page(page);
put_page(page);
static struct page *new_page(struct page *p, unsigned long private, int **x)
{
int nid = page_to_nid(p);
- return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
+ if (PageHuge(p))
+ return alloc_huge_page_node(page_hstate(compound_head(p)),
+ nid);
+ else
+ return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
}
/*
* was free.
*/
set_migratetype_isolate(p);
+ /*
+ * When the target page is a free hugepage, just remove it
+ * from free hugepage list.
+ */
if (!get_page_unless_zero(compound_head(p))) {
- if (is_free_buddy_page(p)) {
- pr_debug("get_any_page: %#lx free buddy page\n", pfn);
+ if (PageHuge(p)) {
+ pr_info("get_any_page: %#lx free huge page\n", pfn);
+ ret = dequeue_hwpoisoned_huge_page(compound_head(p));
+ } else if (is_free_buddy_page(p)) {
+ pr_info("get_any_page: %#lx free buddy page\n", pfn);
/* Set hwpoison bit while page is still isolated */
SetPageHWPoison(p);
ret = 0;
} else {
- pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n",
+ pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n",
pfn, p->flags);
ret = -EIO;
}
return ret;
}
+static int soft_offline_huge_page(struct page *page, int flags)
+{
+ int ret;
+ unsigned long pfn = page_to_pfn(page);
+ struct page *hpage = compound_head(page);
+ LIST_HEAD(pagelist);
+
+ ret = get_any_page(page, pfn, flags);
+ if (ret < 0)
+ return ret;
+ if (ret == 0)
+ goto done;
+
+ if (PageHWPoison(hpage)) {
+ put_page(hpage);
+ pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
+ return -EBUSY;
+ }
+
+ /* Keep page count to indicate a given hugepage is isolated. */
+
+ list_add(&hpage->lru, &pagelist);
+ ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
+ if (ret) {
+ putback_lru_pages(&pagelist);
+ pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+ pfn, ret, page->flags);
+ if (ret > 0)
+ ret = -EIO;
+ return ret;
+ }
+done:
+ if (!PageHWPoison(hpage))
+ atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
+ set_page_hwpoison_huge_page(hpage);
+ dequeue_hwpoisoned_huge_page(hpage);
+ /* keep elevated page count for bad page */
+ return ret;
+}
+
/**
* soft_offline_page - Soft offline a page.
* @page: page to offline
int ret;
unsigned long pfn = page_to_pfn(page);
+ if (PageHuge(page))
+ return soft_offline_huge_page(page, flags);
+
ret = get_any_page(page, pfn, flags);
if (ret < 0)
return ret;
goto done;
}
if (!PageLRU(page)) {
- pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n",
+ pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
pfn, page->flags);
return -EIO;
}
if (PageHWPoison(page)) {
unlock_page(page);
put_page(page);
- pr_debug("soft offline: %#lx page already poisoned\n", pfn);
+ pr_info("soft offline: %#lx page already poisoned\n", pfn);
return -EBUSY;
}
put_page(page);
if (ret == 1) {
ret = 0;
- pr_debug("soft_offline: %#lx: invalidated\n", pfn);
+ pr_info("soft_offline: %#lx: invalidated\n", pfn);
goto done;
}
list_add(&page->lru, &pagelist);
ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0);
if (ret) {
- pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+ pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
pfn, ret, page->flags);
if (ret > 0)
ret = -EIO;
}
} else {
- pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
+ pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
pfn, ret, page_count(page), page->flags);
}
if (ret)