mm/migrate.c

   1 /*
   2  * Memory Migration functionality - linux/mm/migrate.c
   3  *
   4  * Copyright (C) 2006 Silicon Graphics, Inc., Christoph Lameter
   5  *
   6  * Page migration was first developed in the context of the memory hotplug
   7  * project. The main authors of the migration code are:
   8  *
   9  * IWAMOTO Toshihiro <iwamoto@valinux.co.jp>
  10  * Hirokazu Takahashi <taka@valinux.co.jp>
  11  * Dave Hansen <haveblue@us.ibm.com>
  12  * Christoph Lameter <clameter@sgi.com>
  13  */
  14
  15 #include <linux/migrate.h>
  16 #include <linux/module.h>
  17 #include <linux/swap.h>
  18 #include <linux/swapops.h>
  19 #include <linux/pagemap.h>
  20 #include <linux/buffer_head.h>
  21 #include <linux/mm_inline.h>
  22 #include <linux/pagevec.h>
  23 #include <linux/rmap.h>
  24 #include <linux/topology.h>
  25 #include <linux/cpu.h>
  26 #include <linux/cpuset.h>
  27 #include <linux/writeback.h>
  28
  29 #include "internal.h"
  30
  /*
   * Map a list head to the struct page that embeds the list's last node,
   * i.e. the entry reached through ->prev and linked via page->lru.
   */
  #define lru_to_page(_list) (list_entry((_list)->prev, struct page, lru))
  32
  33 /*
  34  * Isolate one page from the LRU lists. If successful put it onto
  35  * the indicated list with elevated page count.
  36  *
  37  * Result:
  38  *  -EBUSY: page not on LRU list
  39  *  0: page removed from LRU list and added to the specified list.
  40  */
  41 int isolate_lru_page(struct page *page, struct list_head *pagelist)
  42 {
  43         int ret = -EBUSY;
  44
  45         if (PageLRU(page)) {
  46                 struct zone *zone = page_zone(page);
  47
  48                 spin_lock_irq(&zone->lru_lock);
  49                 if (PageLRU(page)) {
  50                         ret = 0;
  51                         get_page(page);
  52                         ClearPageLRU(page);
  53                         if (PageActive(page))
  54                                 del_page_from_active_list(zone, page);
  55                         else
  56                                 del_page_from_inactive_list(zone, page);
  57                         list_add_tail(&page->lru, pagelist);
  58                 }
  59                 spin_unlock_irq(&zone->lru_lock);
  60         }
  61         return ret;
  62 }
  63
  /*
   * migrate_prep() - get the LRU lists ready for page isolation.
   *
   * Flushes pending LRU additions with lru_add_drain_all() so that
   * isolate_lru_page() can find the pages on the LRU. It is meant to be
   * called ahead of a series of migrate_pages() calls.
   *
   * Always returns 0.
   */
  int migrate_prep(void)
  {
          /*
           * Draining does not pin anything: a page may leave the LRU again
           * right after this call. Such a page simply fails to migrate, the
           * same way any other busy page does.
           */
          lru_add_drain_all();

          return 0;
  }
  81
  /*
   * Hand an isolated page back to the LRU machinery and drop one reference
   * on it. A page flagged active goes to the active list, every other page
   * to the inactive list.
   */
  static inline void move_to_lru(struct page *page)
  {
          if (!PageActive(page)) {
                  lru_cache_add(page);
          } else {
                  /*
                   * lru_cache_add_active() checks that PG_active is off,
                   * so clear the bit before handing the page over.
                   */
                  ClearPageActive(page);
                  lru_cache_add_active(page);
          }
          put_page(page);
  }
  96
  97 /*
  98  * Add isolated pages on the list back to the LRU.
  99  *
 100  * returns the number of pages put back.
 101  */
 102 int putback_lru_pages(struct list_head *l)
 103 {
 104         struct page *page;
 105         struct page *page2;
 106         int count = 0;
 107
 108         list_for_each_entry_safe(page, page2, l, lru) {
 109                 list_del(&page->lru);
 110                 move_to_lru(page);
 111                 count++;
 112         }
 113         return count;
 114 }
 115
 116 static inline int is_swap_pte(pte_t pte)
 117 {
 118         return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
 119 }
 120
 121 /*
 122  * Restore a potential migration pte to a working pte entry
 123  */
 124 static void remove_migration_pte(struct vm_area_struct *vma,
 125                 struct page *old, struct page *new)
 126 {
 127         struct mm_struct *mm = vma->vm_mm;
 128         swp_entry_t entry;
 129         pgd_t *pgd;
 130         pud_t *pud;
 131         pmd_t *pmd;
 132         pte_t *ptep, pte;
 133         spinlock_t *ptl;
 134         unsigned long addr = page_address_in_vma(new, vma);
 135
 136         if (addr == -EFAULT)
 137                 return;
 138
 139         pgd = pgd_offset(mm, addr);
 140         if (!pgd_present(*pgd))
 141                 return;
 142
 143         pud = pud_offset(pgd, addr);
 144         if (!pud_present(*pud))
 145                 return;
 146
 147         pmd = pmd_offset(pud, addr);
 148         if (!pmd_present(*pmd))
 149                 return;
 150
 151         ptep = pte_offset_map(pmd, addr);
 152
 153         if (!is_swap_pte(*ptep)) {
 154                 pte_unmap(ptep);
 155                 return;
 156         }
 157
 158         ptl = pte_lockptr(mm, pmd);
 159         spin_lock(ptl);
 160         pte = *ptep;
 161         if (!is_swap_pte(pte))
 162                 goto out;
 163
 164         entry = pte_to_swp_entry(pte);
 165
 166         if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
 167                 goto out;
 168
 169         get_page(new);
 170         pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
 171         if (is_write_migration_entry(entry))
 172                 pte = pte_mkwrite(pte);
 173         set_pte_at(mm, addr, ptep, pte);
 174
 175         if (PageAnon(new))
 176                 page_add_anon_rmap(new, vma, addr);
 177         else
 178                 page_add_file_rmap(new);
 179
 180         /* No need to invalidate - it was non-present before */
 181         update_mmu_cache(vma, addr, pte);
 182         lazy_mmu_prot_update(pte);
 183
 184 out:
 185         pte_unmap_unlock(ptep, ptl);
 186 }
 187
 188 /*
 189  * Note that remove_file_migration_ptes will only work on regular mappings,
 190  * Nonlinear mappings do not use migration entries.
 191  */
 192 static void remove_file_migration_ptes(struct page *old, struct page *new)
 193 {
 194         struct vm_area_struct *vma;
 195         struct address_space *mapping = page_mapping(new);
 196         struct prio_tree_iter iter;
 197         pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
 198
 199         if (!mapping)
 200                 return;
 201
 202         spin_lock(&mapping->i_mmap_lock);
 203
 204         vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff)
 205                 remove_migration_pte(vma, old, new);
 206
 207         spin_unlock(&mapping->i_mmap_lock);
 208 }
 209
 210 /*
 211  * Must hold mmap_sem lock on at least one of the vmas containing
 212  * the page so that the anon_vma cannot vanish.
 213  */
 214 static void remove_anon_migration_ptes(struct page *old, struct page *new)
 215 {
 216         struct anon_vma *anon_vma;
 217         struct vm_area_struct *vma;
 218         unsigned long mapping;
 219
 220         mapping = (unsigned long)new->mapping;
 221
 222         if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0)
 223                 return;
 224
 225         /*
 226          * We hold the mmap_sem lock. So no need to call page_lock_anon_vma.
 227          */
 228         anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON);
 229         spin_lock(&anon_vma->lock);
 230
 231         list_for_each_entry(vma, &anon_vma->head, anon_vma_node)
 232                 remove_migration_pte(vma, old, new);
 233
 234         spin_unlock(&anon_vma->lock);
 235 }
 236
 /*
  * Replace all migration entries that refer to @old with ptes mapping
  * @new. Whether the anonymous or the file variant runs is decided by
  * PageAnon(@new).
  */
 static void remove_migration_ptes(struct page *old, struct page *new)
 {
         if (!PageAnon(new)) {
                 remove_file_migration_ptes(old, new);
                 return;
         }

         remove_anon_migration_ptes(old, new);
 }
 248
 249 /*
 250  * Something used the pte of a page under migration. We need to
 251  * get to the page and wait until migration is finished.
 252  * When we return from this function the fault will be retried.
 253  *
 254  * This function is called from do_swap_page().
 255  */
 256 void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
 257                                 unsigned long address)
 258 {
 259         pte_t *ptep, pte;
 260         spinlock_t *ptl;
 261         swp_entry_t entry;
 262         struct page *page;
 263
 264         ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
 265         pte = *ptep;
 266         if (!is_swap_pte(pte))
 267                 goto out;
 268
 269         entry = pte_to_swp_entry(pte);
 270         if (!is_migration_entry(entry))
 271                 goto out;
 272
 273         page = migration_entry_to_page(entry);
 274
 275         get_page(page);
 276         pte_unmap_unlock(ptep, ptl);
 277         wait_on_page_locked(page);
 278         put_page(page);
 279         return;
 280 out:
 281         pte_unmap_unlock(ptep, ptl);
 282 }
 283
 284 /*
 285  * Replace the page in the mapping.
 286  *
 287  * The number of remaining references must be:
 288  * 1 for anonymous pages without a mapping
 289  * 2 for pages with a mapping
 290  * 3 for pages with a mapping and PagePrivate set.
 291  */
 292 static int migrate_page_move_mapping(struct address_space *mapping,
 293                 struct page *newpage, struct page *page)
 294 {
 295         struct page **radix_pointer;
 296
 297         if (!mapping) {
 298                 /* Anonymous page */
 299                 if (page_count(page) != 1)
 300                         return -EAGAIN;
 301                 return 0;
 302         }
 303
 304         write_lock_irq(&mapping->tree_lock);
 305
 306         radix_pointer = (struct page **)radix_tree_lookup_slot(
 307                                                 &mapping->page_tree,
 308                                                 page_index(page));
 309
 310         if (page_count(page) != 2 + !!PagePrivate(page) ||
 311                         *radix_pointer != page) {
 312                 write_unlock_irq(&mapping->tree_lock);
 313                 return -EAGAIN;
 314         }
 315
 316         /*
 317          * Now we know that no one else is looking at the page.
 318          */
 319         get_page(newpage);
 320 #ifdef CONFIG_SWAP
 321         if (PageSwapCache(page)) {
 322                 SetPageSwapCache(newpage);
 323                 set_page_private(newpage, page_private(page));
 324         }
 325 #endif
 326
 327         *radix_pointer = newpage;
 328         __put_page(page);
 329         write_unlock_irq(&mapping->tree_lock);
 330
 331         return 0;
 332 }
 333
 334 /*
 335  * Copy the page to its new location
 336  */
 337 static void migrate_page_copy(struct page *newpage, struct page *page)
 338 {
 339         copy_highpage(newpage, page);
 340
 341         if (PageError(page))
 342                 SetPageError(newpage);
 343         if (PageReferenced(page))
 344                 SetPageReferenced(newpage);
 345         if (PageUptodate(page))
 346                 SetPageUptodate(newpage);
 347         if (PageActive(page))
 348                 SetPageActive(newpage);
 349         if (PageChecked(page))
 350                 SetPageChecked(newpage);
 351         if (PageMappedToDisk(page))
 352                 SetPageMappedToDisk(newpage);
 353
 354         if (PageDirty(page)) {
 355                 clear_page_dirty_for_io(page);
 356                 set_page_dirty(newpage);
 357         }
 358
 359 #ifdef CONFIG_SWAP
 360         ClearPageSwapCache(page);
 361 #endif
 362         ClearPageActive(page);
 363         ClearPagePrivate(page);
 364         set_page_private(page, 0);
 365         page->mapping = NULL;
 366
 367         /*
 368          * If any waiters have accumulated on the new page then
 369          * wake them up.
 370          */
 371         if (PageWriteback(newpage))
 372                 end_page_writeback(newpage);
 373 }
 374
 375 /************************************************************
 376  *                    Migration functions
 377  ***********************************************************/
 378
 /*
  * Migration method for mappings whose pages must not be moved:
  * ignores its arguments and always reports -EIO.
  */
 int fail_migrate_page(struct address_space *mapping,
                         struct page *newpage, struct page *page)
 {
         const int rc = -EIO;

         return rc;
 }
 EXPORT_SYMBOL(fail_migrate_page);
 386
 /*
  * Generic single-page migration for pages that do not use PagePrivate.
  *
  * Pages are locked upon entry and exit. Writeback on @page must already
  * be complete.
  *
  * Returns 0 on success, otherwise the error reported by
  * migrate_page_move_mapping().
  */
 int migrate_page(struct address_space *mapping,
                 struct page *newpage, struct page *page)
 {
         int rc;

         /* Writeback must be complete */
         BUG_ON(PageWriteback(page));

         rc = migrate_page_move_mapping(mapping, newpage, page);
         if (!rc)
                 migrate_page_copy(newpage, page);

         return rc;
 }
 EXPORT_SYMBOL(migrate_page);
 409
 410 /*
 411  * Migration function for pages with buffers. This function can only be used
 412  * if the underlying filesystem guarantees that no other references to "page"
 413  * exist.
 414  */
 415 int buffer_migrate_page(struct address_space *mapping,
 416                 struct page *newpage, struct page *page)
 417 {
 418         struct buffer_head *bh, *head;
 419         int rc;
 420
 421         if (!page_has_buffers(page))
 422                 return migrate_page(mapping, newpage, page);
 423
 424         head = page_buffers(page);
 425
 426         rc = migrate_page_move_mapping(mapping, newpage, page);
 427
 428         if (rc)
 429                 return rc;
 430
 431         bh = head;
 432         do {
 433                 get_bh(bh);
 434                 lock_buffer(bh);
 435                 bh = bh->b_this_page;
 436
 437         } while (bh != head);
 438
 439         ClearPagePrivate(page);
 440         set_page_private(newpage, page_private(page));
 441         set_page_private(page, 0);
 442         put_page(page);
 443         get_page(newpage);
 444
 445         bh = head;
 446         do {
 447                 set_bh_page(bh, newpage, bh_offset(bh));
 448                 bh = bh->b_this_page;
 449
 450         } while (bh != head);
 451
 452         SetPagePrivate(newpage);
 453
 454         migrate_page_copy(newpage, page);
 455
 456         bh = head;
 457         do {
 458                 unlock_buffer(bh);
 459                 put_bh(bh);
 460                 bh = bh->b_this_page;
 461
 462         } while (bh != head);
 463
 464         return 0;
 465 }
 466 EXPORT_SYMBOL(buffer_migrate_page);
 467
 468 /*
 469  * Writeback a page to clean the dirty state
 470  */
 471 static int writeout(struct address_space *mapping, struct page *page)
 472 {
 473         struct writeback_control wbc = {
 474                 .sync_mode = WB_SYNC_NONE,
 475                 .nr_to_write = 1,
 476                 .range_start = 0,
 477                 .range_end = LLONG_MAX,
 478                 .nonblocking = 1,
 479                 .for_reclaim = 1
 480         };
 481         int rc;
 482
 483         if (!mapping->a_ops->writepage)
 484                 /* No write method for the address space */
 485                 return -EINVAL;
 486
 487         if (!clear_page_dirty_for_io(page))
 488                 /* Someone else already triggered a write */
 489                 return -EAGAIN;
 490
 491         /*
 492          * A dirty page may imply that the underlying filesystem has
 493          * the page on some queue. So the page must be clean for
 494          * migration. Writeout may mean we loose the lock and the
 495          * page state is no longer what we checked for earlier.
 496          * At this point we know that the migration attempt cannot
 497          * be successful.
 498          */
 499         remove_migration_ptes(page, page);
 500
 501         rc = mapping->a_ops->writepage(page, &wbc);
 502         if (rc < 0)
 503                 /* I/O Error writing */
 504                 return -EIO;
 505
 506         if (rc != AOP_WRITEPAGE_ACTIVATE)
 507                 /* unlocked. Relock */
 508                 lock_page(page);
 509
 510         return -EAGAIN;
 511 }
 512
 513 /*
 514  * Default handling if a filesystem does not provide a migration function.
 515  */
 516 static int fallback_migrate_page(struct address_space *mapping,
 517         struct page *newpage, struct page *page)
 518 {
 519         if (PageDirty(page))
 520                 return writeout(mapping, page);
 521
 522         /*
 523          * Buffers may be managed in a filesystem specific way.
 524          * We must have no buffers or drop them.
 525          */
 526         if (page_has_buffers(page) &&
 527             !try_to_release_page(page, GFP_KERNEL))
 528                 return -EAGAIN;
 529
 530         return migrate_page(mapping, newpage, page);
 531 }
 532
 533 /*
 534  * Move a page to a newly allocated page
 535  * The page is locked and all ptes have been successfully removed.
 536  *
 537  * The new page will have replaced the old page if this function
 538  * is successful.
 539  */
 540 static int move_to_new_page(struct page *newpage, struct page *page)
 541 {
 542         struct address_space *mapping;
 543         int rc;
 544
 545         /*
 546          * Block others from accessing the page when we get around to
 547          * establishing additional references. We are the only one
 548          * holding a reference to the new page at this point.
 549          */
 550         if (TestSetPageLocked(newpage))
 551                 BUG();
 552
 553         /* Prepare mapping for the new page.*/
 554         newpage->index = page->index;
 555         newpage->mapping = page->mapping;
 556
 557         mapping = page_mapping(page);
 558         if (!mapping)
 559                 rc = migrate_page(mapping, newpage, page);
 560         else if (mapping->a_ops->migratepage)
 561                 /*
 562                  * Most pages have a mapping and most filesystems
 563                  * should provide a migration function. Anonymous
 564                  * pages are part of swap space which also has its
 565                  * own migration function. This is the most common
 566                  * path for page migration.
 567                  */
 568                 rc = mapping->a_ops->migratepage(mapping,
 569                                                 newpage, page);
 570         else
 571                 rc = fallback_migrate_page(mapping, newpage, page);
 572
 573         if (!rc)
 574                 remove_migration_ptes(page, newpage);
 575         else
 576                 newpage->mapping = NULL;
 577
 578         unlock_page(newpage);
 579
 580         return rc;
 581 }
 582
 583 /*
 584  * Obtain the lock on page, remove all ptes and migrate the page
 585  * to the newly allocated page in newpage.
 586  */
 587 static int unmap_and_move(new_page_t get_new_page, unsigned long private,
 588                         struct page *page, int force)
 589 {
 590         int rc = 0;
 591         struct page *newpage = get_new_page(page, private);
 592
 593         if (!newpage)
 594                 return -ENOMEM;
 595
 596         if (page_count(page) == 1)
 597                 /* page was freed from under us. So we are done. */
 598                 goto move_newpage;
 599
 600         rc = -EAGAIN;
 601         if (TestSetPageLocked(page)) {
 602                 if (!force)
 603                         goto move_newpage;
 604                 lock_page(page);
 605         }
 606
 607         if (PageWriteback(page)) {
 608                 if (!force)
 609                         goto unlock;
 610                 wait_on_page_writeback(page);
 611         }
 612
 613         /*
 614          * Establish migration ptes or remove ptes
 615          */
 616         if (try_to_unmap(page, 1) != SWAP_FAIL) {
 617                 if (!page_mapped(page))
 618                         rc = move_to_new_page(newpage, page);
 619         } else
 620                 /* A vma has VM_LOCKED set -> permanent failure */
 621                 rc = -EPERM;
 622
 623         if (rc)
 624                 remove_migration_ptes(page, page);
 625 unlock:
 626         unlock_page(page);
 627
 628         if (rc != -EAGAIN) {
 629                 /*
 630                  * A page that has been migrated has all references
 631                  * removed and will be freed. A page that has not been
 632                  * migrated will have kepts its references and be
 633                  * restored.
 634                  */
 635                 list_del(&page->lru);
 636                 move_to_lru(page);
 637         }
 638
 639 move_newpage:
 640         /*
 641          * Move the new page to the LRU. If migration was not successful
 642          * then this will free the page.
 643          */
 644         move_to_lru(newpage);
 645         return rc;
 646 }
 647
 648 /*
 649  * migrate_pages
 650  *
 651  * The function takes one list of pages to migrate and a function
 652  * that determines from the page to be migrated and the private data
 653  * the target of the move and allocates the page.
 654  *
 655  * The function returns after 10 attempts or if no pages
 656  * are movable anymore because to has become empty
 657  * or no retryable pages exist anymore. All pages will be
 658  * retruned to the LRU or freed.
 659  *
 660  * Return: Number of pages not migrated or error code.
 661  */
 662 int migrate_pages(struct list_head *from,
 663                 new_page_t get_new_page, unsigned long private)
 664 {
 665         int retry = 1;
 666         int nr_failed = 0;
 667         int pass = 0;
 668         struct page *page;
 669         struct page *page2;
 670         int swapwrite = current->flags & PF_SWAPWRITE;
 671         int rc;
 672
 673         if (!swapwrite)
 674                 current->flags |= PF_SWAPWRITE;
 675
 676         for(pass = 0; pass < 10 && retry; pass++) {
 677                 retry = 0;
 678
 679                 list_for_each_entry_safe(page, page2, from, lru) {
 680                         cond_resched();
 681
 682                         rc = unmap_and_move(get_new_page, private,
 683                                                 page, pass > 2);
 684
 685                         switch(rc) {
 686                         case -ENOMEM:
 687                                 goto out;
 688                         case -EAGAIN:
 689                                 retry++;
 690                                 break;
 691                         case 0:
 692                                 break;
 693                         default:
 694                                 /* Permanent failure */
 695                                 nr_failed++;
 696                                 break;
 697                         }
 698                 }
 699         }
 700         rc = 0;
 701 out:
 702         if (!swapwrite)
 703                 current->flags &= ~PF_SWAPWRITE;
 704
 705         putback_lru_pages(from);
 706
 707         if (rc)
 708                 return rc;
 709
 710         return nr_failed + retry;
 711 }
 712