mm/migrate.c

   1 /*
 * Memory Migration functionality - linux/mm/migrate.c
   3  *
   4  * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
   5  *
   6  * Page migration was first developed in the context of the memory hotplug
   7  * project. The main authors of the migration code are:
   8  *
   9  * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
  10  * Hirokazu Takahashi <taka@valinux.co.jp>
  11  * Dave Hansen <haveblue@us.ibm.com>
  12  * Christoph Lameter <clameter@sgi.com>
  13  */
  14
  15 #include <linux/migrate.h>
  16 #include <linux/module.h>
  17 #include <linux/swap.h>
  18 #include <linux/pagemap.h>
  19 #include <linux/buffer_head.h>
  20 #include <linux/mm_inline.h>
  21 #include <linux/pagevec.h>
  22 #include <linux/rmap.h>
  23 #include <linux/topology.h>
  24 #include <linux/cpu.h>
  25 #include <linux/cpuset.h>
  26 #include <linux/swapops.h>
  27
  28 #include "internal.h"
  29
  30 /* The maximum number of pages to take off the LRU for migration */
  31 #define MIGRATE_CHUNK_SIZE 256
  32
  33 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
  34
  35 /*
  36  * Isolate one page from the LRU lists. If successful put it onto
  37  * the indicated list with elevated page count.
  38  *
  39  * Result:
  40  *  -EBUSY: page not on LRU list
  41  *  0: page removed from LRU list and added to the specified list.
  42  */
  43 int isolate_lru_page(struct page *page, struct list_head *pagelist)
  44 {
  45         int ret = -EBUSY;
  46
  47         if (PageLRU(page)) {
  48                 struct zone *zone = page_zone(page);
  49
  50                 spin_lock_irq(&zone->lru_lock);
  51                 if (PageLRU(page)) {
  52                         ret = 0;
  53                         get_page(page);
  54                         ClearPageLRU(page);
  55                         if (PageActive(page))
  56                                 del_page_from_active_list(zone, page);
  57                         else
  58                                 del_page_from_inactive_list(zone, page);
  59                         list_add_tail(&page->lru, pagelist);
  60                 }
  61                 spin_unlock_irq(&zone->lru_lock);
  62         }
  63         return ret;
  64 }
  65
  66 /*
  67  * migrate_prep() needs to be called after we have compiled the list of pages
  68  * to be migrated using isolate_lru_page() but before we begin a series of calls
  69  * to migrate_pages().
  70  */
  71 int migrate_prep(void)
  72 {
  73         /* Must have swap device for migration */
  74         if (nr_swap_pages <= 0)
  75                 return -ENODEV;
  76
  77         /*
  78          * Clear the LRU lists so pages can be isolated.
  79          * Note that pages may be moved off the LRU after we have
  80          * drained them. Those pages will fail to migrate like other
  81          * pages that may be busy.
  82          */
  83         lru_add_drain_all();
  84
  85         return 0;
  86 }
  87
  88 static inline void move_to_lru(struct page *page)
  89 {
  90         list_del(&page->lru);
  91         if (PageActive(page)) {
  92                 /*
  93                  * lru_cache_add_active checks that
  94                  * the PG_active bit is off.
  95                  */
  96                 ClearPageActive(page);
  97                 lru_cache_add_active(page);
  98         } else {
  99                 lru_cache_add(page);
 100         }
 101         put_page(page);
 102 }
 103
 104 /*
 105  * Add isolated pages on the list back to the LRU.
 106  *
 107  * returns the number of pages put back.
 108  */
 109 int putback_lru_pages(struct list_head *l)
 110 {
 111         struct page *page;
 112         struct page *page2;
 113         int count = 0;
 114
 115         list_for_each_entry_safe(page, page2, l, lru) {
 116                 move_to_lru(page);
 117                 count++;
 118         }
 119         return count;
 120 }
 121
 122 /*
 123  * swapout a single page
 124  * page is locked upon entry, unlocked on exit
 125  */
 126 static int swap_page(struct page *page)
 127 {
 128         struct address_space *mapping = page_mapping(page);
 129
 130         if (page_mapped(page) && mapping)
 131                 if (try_to_unmap(page, 1) != SWAP_SUCCESS)
 132                         goto unlock_retry;
 133
 134         if (PageDirty(page)) {
 135                 /* Page is dirty, try to write it out here */
 136                 switch(pageout(page, mapping)) {
 137                 case PAGE_KEEP:
 138                 case PAGE_ACTIVATE:
 139                         goto unlock_retry;
 140
 141                 case PAGE_SUCCESS:
 142                         goto retry;
 143
 144                 case PAGE_CLEAN:
 145                         ; /* try to free the page below */
 146                 }
 147         }
 148
 149         if (PagePrivate(page)) {
 150                 if (!try_to_release_page(page, GFP_KERNEL) ||
 151                     (!mapping && page_count(page) == 1))
 152                         goto unlock_retry;
 153         }
 154
 155         if (remove_mapping(mapping, page)) {
 156                 /* Success */
 157                 unlock_page(page);
 158                 return 0;
 159         }
 160
 161 unlock_retry:
 162         unlock_page(page);
 163
 164 retry:
 165         return -EAGAIN;
 166 }
 167
 168 /*
 169  * Replace the page in the mapping.
 170  *
 171  * The number of remaining references must be:
 172  * 1 for anonymous pages without a mapping
 173  * 2 for pages with a mapping
 174  * 3 for pages with a mapping and PagePrivate set.
 175  */
 176 static int migrate_page_move_mapping(struct page *newpage,
 177                                 struct page *page)
 178 {
 179         struct address_space *mapping = page_mapping(page);
 180         struct page **radix_pointer;
 181
 182         if (!mapping)
 183                 return -EAGAIN;
 184
 185         write_lock_irq(&mapping->tree_lock);
 186
 187         radix_pointer = (struct page **)radix_tree_lookup_slot(
 188                                                 &mapping->page_tree,
 189                                                 page_index(page));
 190
 191         if (!page_mapping(page) ||
 192                         page_count(page) != 2 + !!PagePrivate(page) ||
 193                         *radix_pointer != page) {
 194                 write_unlock_irq(&mapping->tree_lock);
 195                 return -EAGAIN;
 196         }
 197
 198         /*
 199          * Now we know that no one else is looking at the page.
 200          *
 201          * Certain minimal information about a page must be available
 202          * in order for other subsystems to properly handle the page if they
 203          * find it through the radix tree update before we are finished
 204          * copying the page.
 205          */
 206         get_page(newpage);
 207         newpage->index = page->index;
 208         newpage->mapping = page->mapping;
 209         if (PageSwapCache(page)) {
 210                 SetPageSwapCache(newpage);
 211                 set_page_private(newpage, page_private(page));
 212         }
 213
 214         *radix_pointer = newpage;
 215         __put_page(page);
 216         write_unlock_irq(&mapping->tree_lock);
 217
 218         return 0;
 219 }
 220
 221 /*
 222  * Copy the page to its new location
 223  */
 224 static void migrate_page_copy(struct page *newpage, struct page *page)
 225 {
 226         copy_highpage(newpage, page);
 227
 228         if (PageError(page))
 229                 SetPageError(newpage);
 230         if (PageReferenced(page))
 231                 SetPageReferenced(newpage);
 232         if (PageUptodate(page))
 233                 SetPageUptodate(newpage);
 234         if (PageActive(page))
 235                 SetPageActive(newpage);
 236         if (PageChecked(page))
 237                 SetPageChecked(newpage);
 238         if (PageMappedToDisk(page))
 239                 SetPageMappedToDisk(newpage);
 240
 241         if (PageDirty(page)) {
 242                 clear_page_dirty_for_io(page);
 243                 set_page_dirty(newpage);
 244         }
 245
 246         ClearPageSwapCache(page);
 247         ClearPageActive(page);
 248         ClearPagePrivate(page);
 249         set_page_private(page, 0);
 250         page->mapping = NULL;
 251
 252         /*
 253          * If any waiters have accumulated on the new page then
 254          * wake them up.
 255          */
 256         if (PageWriteback(newpage))
 257                 end_page_writeback(newpage);
 258 }
 259
 260 /************************************************************
 261  *                    Migration functions
 262  ***********************************************************/
 263
 264 /* Always fail migration. Used for mappings that are not movable */
 265 int fail_migrate_page(struct page *newpage, struct page *page)
 266 {
 267         return -EIO;
 268 }
 269 EXPORT_SYMBOL(fail_migrate_page);
 270
 271 /*
 272  * Common logic to directly migrate a single page suitable for
 273  * pages that do not use PagePrivate.
 274  *
 275  * Pages are locked upon entry and exit.
 276  */
 277 int migrate_page(struct page *newpage, struct page *page)
 278 {
 279         int rc;
 280
 281         BUG_ON(PageWriteback(page));    /* Writeback must be complete */
 282
 283         rc = migrate_page_move_mapping(newpage, page);
 284
 285         if (rc)
 286                 return rc;
 287
 288         migrate_page_copy(newpage, page);
 289
 290         /*
 291          * Remove auxiliary swap entries and replace
 292          * them with real ptes.
 293          *
 294          * Note that a real pte entry will allow processes that are not
 295          * waiting on the page lock to use the new page via the page tables
 296          * before the new page is unlocked.
 297          */
 298         remove_from_swap(newpage);
 299         return 0;
 300 }
 301 EXPORT_SYMBOL(migrate_page);
 302
 303 /*
 304  * Migration function for pages with buffers. This function can only be used
 305  * if the underlying filesystem guarantees that no other references to "page"
 306  * exist.
 307  */
 308 int buffer_migrate_page(struct page *newpage, struct page *page)
 309 {
 310         struct address_space *mapping = page->mapping;
 311         struct buffer_head *bh, *head;
 312         int rc;
 313
 314         if (!mapping)
 315                 return -EAGAIN;
 316
 317         if (!page_has_buffers(page))
 318                 return migrate_page(newpage, page);
 319
 320         head = page_buffers(page);
 321
 322         rc = migrate_page_move_mapping(newpage, page);
 323
 324         if (rc)
 325                 return rc;
 326
 327         bh = head;
 328         do {
 329                 get_bh(bh);
 330                 lock_buffer(bh);
 331                 bh = bh->b_this_page;
 332
 333         } while (bh != head);
 334
 335         ClearPagePrivate(page);
 336         set_page_private(newpage, page_private(page));
 337         set_page_private(page, 0);
 338         put_page(page);
 339         get_page(newpage);
 340
 341         bh = head;
 342         do {
 343                 set_bh_page(bh, newpage, bh_offset(bh));
 344                 bh = bh->b_this_page;
 345
 346         } while (bh != head);
 347
 348         SetPagePrivate(newpage);
 349
 350         migrate_page_copy(newpage, page);
 351
 352         bh = head;
 353         do {
 354                 unlock_buffer(bh);
 355                 put_bh(bh);
 356                 bh = bh->b_this_page;
 357
 358         } while (bh != head);
 359
 360         return 0;
 361 }
 362 EXPORT_SYMBOL(buffer_migrate_page);
 363
 364 /*
 365  * migrate_pages
 366  *
 367  * Two lists are passed to this function. The first list
 368  * contains the pages isolated from the LRU to be migrated.
 369  * The second list contains new pages that the pages isolated
 370  * can be moved to. If the second list is NULL then all
 371  * pages are swapped out.
 372  *
 373  * The function returns after 10 attempts or if no pages
 374  * are movable anymore because to has become empty
 375  * or no retryable pages exist anymore.
 376  *
 377  * Return: Number of pages not migrated when "to" ran empty.
 378  */
 379 int migrate_pages(struct list_head *from, struct list_head *to,
 380                   struct list_head *moved, struct list_head *failed)
 381 {
 382         int retry;
 383         int nr_failed = 0;
 384         int pass = 0;
 385         struct page *page;
 386         struct page *page2;
 387         int swapwrite = current->flags & PF_SWAPWRITE;
 388         int rc;
 389
 390         if (!swapwrite)
 391                 current->flags |= PF_SWAPWRITE;
 392
 393 redo:
 394         retry = 0;
 395
 396         list_for_each_entry_safe(page, page2, from, lru) {
 397                 struct page *newpage = NULL;
 398                 struct address_space *mapping;
 399
 400                 cond_resched();
 401
 402                 rc = 0;
 403                 if (page_count(page) == 1)
 404                         /* page was freed from under us. So we are done. */
 405                         goto next;
 406
 407                 if (to && list_empty(to))
 408                         break;
 409
 410                 /*
 411                  * Skip locked pages during the first two passes to give the
 412                  * functions holding the lock time to release the page. Later we
 413                  * use lock_page() to have a higher chance of acquiring the
 414                  * lock.
 415                  */
 416                 rc = -EAGAIN;
 417                 if (pass > 2)
 418                         lock_page(page);
 419                 else
 420                         if (TestSetPageLocked(page))
 421                                 goto next;
 422
 423                 /*
 424                  * Only wait on writeback if we have already done a pass where
 425                  * we we may have triggered writeouts for lots of pages.
 426                  */
 427                 if (pass > 0) {
 428                         wait_on_page_writeback(page);
 429                 } else {
 430                         if (PageWriteback(page))
 431                                 goto unlock_page;
 432                 }
 433
 434                 /*
 435                  * Anonymous pages must have swap cache references otherwise
 436                  * the information contained in the page maps cannot be
 437                  * preserved.
 438                  */
 439                 if (PageAnon(page) && !PageSwapCache(page)) {
 440                         if (!add_to_swap(page, GFP_KERNEL)) {
 441                                 rc = -ENOMEM;
 442                                 goto unlock_page;
 443                         }
 444                 }
 445
 446                 if (!to) {
 447                         rc = swap_page(page);
 448                         goto next;
 449                 }
 450
 451                 newpage = lru_to_page(to);
 452                 lock_page(newpage);
 453
 454                 /*
 455                  * Establish swap ptes for anonymous pages or destroy pte
 456                  * maps for files.
 457                  *
 458                  * In order to reestablish file backed mappings the fault handlers
 459                  * will take the radix tree_lock which may then be used to stop
 460                  * processses from accessing this page until the new page is ready.
 461                  *
 462                  * A process accessing via a swap pte (an anonymous page) will take a
 463                  * page_lock on the old page which will block the process until the
 464                  * migration attempt is complete. At that time the PageSwapCache bit
 465                  * will be examined. If the page was migrated then the PageSwapCache
 466                  * bit will be clear and the operation to retrieve the page will be
 467                  * retried which will find the new page in the radix tree. Then a new
 468                  * direct mapping may be generated based on the radix tree contents.
 469                  *
 470                  * If the page was not migrated then the PageSwapCache bit
 471                  * is still set and the operation may continue.
 472                  */
 473                 rc = -EPERM;
 474                 if (try_to_unmap(page, 1) == SWAP_FAIL)
 475                         /* A vma has VM_LOCKED set -> permanent failure */
 476                         goto unlock_both;
 477
 478                 rc = -EAGAIN;
 479                 if (page_mapped(page))
 480                         goto unlock_both;
 481                 /*
 482                  * Pages are properly locked and writeback is complete.
 483                  * Try to migrate the page.
 484                  */
 485                 mapping = page_mapping(page);
 486                 if (!mapping)
 487                         goto unlock_both;
 488
 489                 if (mapping->a_ops->migratepage) {
 490                         /*
 491                          * Most pages have a mapping and most filesystems
 492                          * should provide a migration function. Anonymous
 493                          * pages are part of swap space which also has its
 494                          * own migration function. This is the most common
 495                          * path for page migration.
 496                          */
 497                         rc = mapping->a_ops->migratepage(newpage, page);
 498                         goto unlock_both;
 499                 }
 500
 501                 /*
 502                  * Default handling if a filesystem does not provide
 503                  * a migration function. We can only migrate clean
 504                  * pages so try to write out any dirty pages first.
 505                  */
 506                 if (PageDirty(page)) {
 507                         switch (pageout(page, mapping)) {
 508                         case PAGE_KEEP:
 509                         case PAGE_ACTIVATE:
 510                                 goto unlock_both;
 511
 512                         case PAGE_SUCCESS:
 513                                 unlock_page(newpage);
 514                                 goto next;
 515
 516                         case PAGE_CLEAN:
 517                                 ; /* try to migrate the page below */
 518                         }
 519                 }
 520
 521                 /*
 522                  * Buffers are managed in a filesystem specific way.
 523                  * We must have no buffers or drop them.
 524                  */
 525                 if (!page_has_buffers(page) ||
 526                     try_to_release_page(page, GFP_KERNEL)) {
 527                         rc = migrate_page(newpage, page);
 528                         goto unlock_both;
 529                 }
 530
 531                 /*
 532                  * On early passes with mapped pages simply
 533                  * retry. There may be a lock held for some
 534                  * buffers that may go away. Later
 535                  * swap them out.
 536                  */
 537                 if (pass > 4) {
 538                         /*
 539                          * Persistently unable to drop buffers..... As a
 540                          * measure of last resort we fall back to
 541                          * swap_page().
 542                          */
 543                         unlock_page(newpage);
 544                         newpage = NULL;
 545                         rc = swap_page(page);
 546                         goto next;
 547                 }
 548
 549 unlock_both:
 550                 unlock_page(newpage);
 551
 552 unlock_page:
 553                 unlock_page(page);
 554
 555 next:
 556                 if (rc == -EAGAIN) {
 557                         retry++;
 558                 } else if (rc) {
 559                         /* Permanent failure */
 560                         list_move(&page->lru, failed);
 561                         nr_failed++;
 562                 } else {
 563                         if (newpage) {
 564                                 /* Successful migration. Return page to LRU */
 565                                 move_to_lru(newpage);
 566                         }
 567                         list_move(&page->lru, moved);
 568                 }
 569         }
 570         if (retry && pass++ < 10)
 571                 goto redo;
 572
 573         if (!swapwrite)
 574                 current->flags &= ~PF_SWAPWRITE;
 575
 576         return nr_failed + retry;
 577 }
 578
 579 /*
 580  * Migrate the list 'pagelist' of pages to a certain destination.
 581  *
 582  * Specify destination with either non-NULL vma or dest_node >= 0
 583  * Return the number of pages not migrated or error code
 584  */
 585 int migrate_pages_to(struct list_head *pagelist,
 586                         struct vm_area_struct *vma, int dest)
 587 {
 588         LIST_HEAD(newlist);
 589         LIST_HEAD(moved);
 590         LIST_HEAD(failed);
 591         int err = 0;
 592         unsigned long offset = 0;
 593         int nr_pages;
 594         struct page *page;
 595         struct list_head *p;
 596
 597 redo:
 598         nr_pages = 0;
 599         list_for_each(p, pagelist) {
 600                 if (vma) {
 601                         /*
 602                          * The address passed to alloc_page_vma is used to
 603                          * generate the proper interleave behavior. We fake
 604                          * the address here by an increasing offset in order
 605                          * to get the proper distribution of pages.
 606                          *
 607                          * No decision has been made as to which page
 608                          * a certain old page is moved to so we cannot
 609                          * specify the correct address.
 610                          */
 611                         page = alloc_page_vma(GFP_HIGHUSER, vma,
 612                                         offset + vma->vm_start);
 613                         offset += PAGE_SIZE;
 614                 }
 615                 else
 616                         page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
 617
 618                 if (!page) {
 619                         err = -ENOMEM;
 620                         goto out;
 621                 }
 622                 list_add_tail(&page->lru, &newlist);
 623                 nr_pages++;
 624                 if (nr_pages > MIGRATE_CHUNK_SIZE)
 625                         break;
 626         }
 627         err = migrate_pages(pagelist, &newlist, &moved, &failed);
 628
 629         putback_lru_pages(&moved);      /* Call release pages instead ?? */
 630
 631         if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
 632                 goto redo;
 633 out:
 634         /* Return leftover allocated pages */
 635         while (!list_empty(&newlist)) {
 636                 page = list_entry(newlist.next, struct page, lru);
 637                 list_del(&page->lru);
 638                 __free_page(page);
 639         }
 640         list_splice(&failed, pagelist);
 641         if (err < 0)
 642                 return err;
 643
 644         /* Calculate number of leftover pages */
 645         nr_pages = 0;
 646         list_for_each(p, pagelist)
 647                 nr_pages++;
 648         return nr_pages;
 649 }