writeback: do not sleep on the congestion queue if there are no congested BDIs or if significant congestion is not being encountered in the current zone

author Mel Gorman <mel@csn.ul.ie>

Tue, 26 Oct 2010 21:21:45 +0000 (14:21 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 26 Oct 2010 23:52:07 +0000 (16:52 -0700)
author Mel Gorman <mel@csn.ul.ie>
Tue, 26 Oct 2010 21:21:45 +0000 (14:21 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 26 Oct 2010 23:52:07 +0000 (16:52 -0700)
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h

index 35b00746c712fc11776e78345edf786b98f9063b..f1b402a50679c8f97d25404cd53e7bbe1277eb0d 100644 (file)
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -285,7 +285,7 @@ enum {
  void clear_bdi_congested(struct backing_dev_info *bdi, int sync);
  void set_bdi_congested(struct backing_dev_info *bdi, int sync);
  long congestion_wait(int sync, long timeout);
-
+long wait_iff_congested(struct zone *zone, int sync, long timeout);
  
  static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi)
  {
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index c3c17fb675eedb1e5c7ca5c10c697a7e40c8e797..39c24ebe9cfd4e75b8841deec06aa6d78c91c2ad 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -423,6 +423,9 @@ struct zone {
  typedef enum {
         ZONE_RECLAIM_LOCKED,            /* prevents concurrent reclaim */
         ZONE_OOM_LOCKED,                /* zone is in OOM killer zonelist */
+       ZONE_CONGESTED,                 /* zone has many dirty pages backed by
+                                        * a congested BDI
+                                        */
  } zone_flags_t;
  
  static inline void zone_set_flag(struct zone *zone, zone_flags_t flag)
@@ -440,6 +443,11 @@ static inline void zone_clear_flag(struct zone *zone, zone_flags_t flag)
         clear_bit(flag, &zone->flags);
  }
  
+static inline int zone_is_reclaim_congested(const struct zone *zone)
+{
+       return test_bit(ZONE_CONGESTED, &zone->flags);
+}
+
  static inline int zone_is_reclaim_locked(const struct zone *zone)
  {
         return test_bit(ZONE_RECLAIM_LOCKED, &zone->flags);
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h

index d2b2654606ecee415d4cd3cfab9b53259379bd00..89a2b2db43751686129d90f8d58fd3c08787726b 100644 (file)
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -179,6 +179,13 @@ DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait,
         TP_ARGS(usec_timeout, usec_delayed)
  );
  
+DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested,
+
+       TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
+
+       TP_ARGS(usec_timeout, usec_delayed)
+);
+
  #endif /* _TRACE_WRITEBACK_H */
  
  /* This part must be outside protection */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c

index 55627306abe0042968ab1bf70b17f9cd1a74fdec..5ad3c106606b5318437c46fb5fdc7b7bbc33451d 100644 (file)
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -729,6 +729,7 @@ static wait_queue_head_t congestion_wqh[2] = {
                 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
                 __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
         };
+static atomic_t nr_bdi_congested[2];
  
  void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
  {
@@ -736,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
         wait_queue_head_t *wqh = &congestion_wqh[sync];
  
         bit = sync ? BDI_sync_congested : BDI_async_congested;
-       clear_bit(bit, &bdi->state);
+       if (test_and_clear_bit(bit, &bdi->state))
+               atomic_dec(&nr_bdi_congested[sync]);
         smp_mb__after_clear_bit();
         if (waitqueue_active(wqh))
                 wake_up(wqh);
@@ -748,7 +750,8 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync)
         enum bdi_state bit;
  
         bit = sync ? BDI_sync_congested : BDI_async_congested;
-       set_bit(bit, &bdi->state);
+       if (!test_and_set_bit(bit, &bdi->state))
+               atomic_inc(&nr_bdi_congested[sync]);
  }
  EXPORT_SYMBOL(set_bdi_congested);
  
@@ -779,3 +782,57 @@ long congestion_wait(int sync, long timeout)
  }
  EXPORT_SYMBOL(congestion_wait);
  
+/**
+ * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes
+ * @zone: A zone to check if it is heavily congested
+ * @sync: SYNC or ASYNC IO; selects the congestion wait queue and BDI counter
+ * @timeout: timeout in jiffies
+ *
+ * In the event of a congested backing_dev (any backing_dev) and the given
+ * @zone has experienced recent congestion, this waits for up to @timeout
+ * jiffies for either a BDI to exit congestion of the given @sync queue
+ * or a write to complete.
+ *
+ * In the absence of BDI or zone congestion, cond_resched() is called to yield
+ * the processor if necessary but otherwise does not sleep.
+ *
+ * The return value is 0 if the sleep is for the full timeout. Otherwise,
+ * it is the number of jiffies that were still remaining when the function
+ * returned. return_value == timeout implies the function did not sleep.
+ */
+long wait_iff_congested(struct zone *zone, int sync, long timeout)
+{
+       long ret;
+       unsigned long start = jiffies;
+       DEFINE_WAIT(wait);
+       wait_queue_head_t *wqh = &congestion_wqh[sync];
+
+       /*
+        * If no BDI is congested for this @sync queue, or heavy congestion
+        * is not being encountered in the current zone, yield if necessary
+        * instead of sleeping on the congestion queue
+        */
+       if (atomic_read(&nr_bdi_congested[sync]) == 0 ||
+                       !zone_is_reclaim_congested(zone)) {
+               cond_resched();
+
+               /* In case we scheduled, work out time remaining (floor of 0) */
+               ret = timeout - (jiffies - start);
+               if (ret < 0)
+                       ret = 0;
+
+               goto out;
+       }
+
+       /* Sleep until uncongested, a write happens or @timeout expires */
+       prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE);
+       ret = io_schedule_timeout(timeout);
+       finish_wait(wqh, &wait);
+
+out:
+       trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout),
+                                       jiffies_to_usecs(jiffies - start));
+
+       return ret;
+}
+EXPORT_SYMBOL(wait_iff_congested);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 6a683f819439a0da44a7dd5287178a6ddcefa86b..b13bc5e5bd7d9213bcb6ac839310c6ba0aba1590 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1907,7 +1907,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
                         preferred_zone, migratetype);
  
                 if (!page && gfp_mask & __GFP_NOFAIL)
-                       congestion_wait(BLK_RW_ASYNC, HZ/50);
+                       wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
         } while (!page && (gfp_mask & __GFP_NOFAIL));
  
         return page;
@@ -2095,7 +2095,7 @@ rebalance:
         pages_reclaimed += did_some_progress;
         if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
                 /* Wait for some write requests to complete then retry */
-               congestion_wait(BLK_RW_ASYNC, HZ/50);
+               wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
                 goto rebalance;
         }
  
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 130ad0239f524c90b78c0ae71954e9524fb96c7b..30fd658bb2897076d4fec53a5b0435d095daaec6 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -401,10 +401,8 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
         }
         if (mapping->a_ops->writepage == NULL)
                 return PAGE_ACTIVATE;
-       if (!may_write_to_queue(mapping->backing_dev_info, sc)) {
-               disable_lumpy_reclaim_mode(sc);
+       if (!may_write_to_queue(mapping->backing_dev_info, sc))
                 return PAGE_KEEP;
-       }
  
         if (clear_page_dirty_for_io(page)) {
                 int res;
@@ -681,11 +679,14 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
   * shrink_page_list() returns the number of reclaimed pages
   */
  static unsigned long shrink_page_list(struct list_head *page_list,
+                                     struct zone *zone,
                                       struct scan_control *sc)
  {
         LIST_HEAD(ret_pages);
         LIST_HEAD(free_pages);
         int pgactivate = 0;
+       unsigned long nr_dirty = 0;
+       unsigned long nr_congested = 0;
         unsigned long nr_reclaimed = 0;
  
         cond_resched();
@@ -705,6 +706,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         goto keep;
  
                 VM_BUG_ON(PageActive(page));
+               VM_BUG_ON(page_zone(page) != zone);
  
                 sc->nr_scanned++;
  
@@ -782,6 +784,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 }
  
                 if (PageDirty(page)) {
+                       nr_dirty++;
+
                         if (references == PAGEREF_RECLAIM_CLEAN)
                                 goto keep_locked;
                         if (!may_enter_fs)
@@ -792,6 +796,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                         /* Page is dirty, try to write it out here */
                         switch (pageout(page, mapping, sc)) {
                         case PAGE_KEEP:
+                               nr_congested++;
                                 goto keep_locked;
                         case PAGE_ACTIVATE:
                                 goto activate_locked;
@@ -902,6 +907,15 @@ keep_lumpy:
                 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
         }
  
+       /*
+        * Tag a zone as congested if all the dirty pages encountered were
+        * backed by a congested BDI. In this case, reclaimers should just
+        * back off and wait for congestion to clear because further reclaim
+        * will encounter the same problem
+        */
+       if (nr_dirty == nr_congested)
+               zone_set_flag(zone, ZONE_CONGESTED);
+
         free_page_list(&free_pages);
  
         list_splice(&ret_pages, page_list);
@@ -1386,12 +1400,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
  
         spin_unlock_irq(&zone->lru_lock);
  
-       nr_reclaimed = shrink_page_list(&page_list, sc);
+       nr_reclaimed = shrink_page_list(&page_list, zone, sc);
  
         /* Check if we should syncronously wait for writeback */
         if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
                 set_lumpy_reclaim_mode(priority, sc, true);
-               nr_reclaimed += shrink_page_list(&page_list, sc);
+               nr_reclaimed += shrink_page_list(&page_list, zone, sc);
         }
  
         local_irq_disable();
@@ -1982,8 +1996,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
  
                 /* Take a nap, wait for some writeback to complete */
                 if (!sc->hibernation_mode && sc->nr_scanned &&
-                   priority < DEF_PRIORITY - 2)
-                       congestion_wait(BLK_RW_ASYNC, HZ/10);
+                   priority < DEF_PRIORITY - 2) {
+                       struct zone *preferred_zone;
+
+                       first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask),
+                                                       NULL, &preferred_zone);
+                       wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10);
+               }
         }
  
  out:
@@ -2282,6 +2301,15 @@ loop_again:
                                 if (!zone_watermark_ok(zone, order,
                                             min_wmark_pages(zone), end_zone, 0))
                                         has_under_min_watermark_zone = 1;
+                       } else {
+                               /*
+                                * If a zone reaches its high watermark,
+                                * consider it to be no longer congested. It's
+                                * possible there are dirty pages backed by
+                                * congested BDIs but as pressure is relieved,
+                                * speculatively avoid congestion waits
+                                */
+                               zone_clear_flag(zone, ZONE_CONGESTED);
                         }
  
                 }
author	Mel Gorman <mel@csn.ul.ie>
	Tue, 26 Oct 2010 21:21:45 +0000 (14:21 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 26 Oct 2010 23:52:07 +0000 (16:52 -0700)
include/linux/backing-dev.h		patch \| blob \| blame \| history
include/linux/mmzone.h		patch \| blob \| blame \| history
include/trace/events/writeback.h		patch \| blob \| blame \| history
mm/backing-dev.c		patch \| blob \| blame \| history
mm/page_alloc.c		patch \| blob \| blame \| history
mm/vmscan.c		patch \| blob \| blame \| history