[PATCH] BLOCK: Make it possible to disable the block layer [try #6]

[net-next-2.6.git] / mm / filemap.c
diff --git a/mm/filemap.c b/mm/filemap.c

index 1ed4be2a7654084743042058e6c9715aa80880b6..c4fe97f5ace0851d3023d1305d8de5c7ad2b1dd5 100644 (file)
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -9,7 +9,6 @@
   * most "normal" filesystems (but you don't /have/ to use this:
   * the NFS filesystem used to do this differently, for example)
   */
-#include <linux/config.h>
  #include <linux/module.h>
  #include <linux/slab.h>
  #include <linux/compiler.h>
@@ -120,7 +119,7 @@ void __remove_from_page_cache(struct page *page)
         radix_tree_delete(&mapping->page_tree, page->index);
         page->mapping = NULL;
         mapping->nrpages--;
-       pagecache_acct(-1);
+       __dec_zone_page_state(page, NR_FILE_PAGES);
  }
  
  void remove_from_page_cache(struct page *page)
@@ -449,7 +448,7 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
                         page->mapping = mapping;
                         page->index = offset;
                         mapping->nrpages++;
-                       pagecache_acct(1);
+                       __inc_zone_page_state(page, NR_FILE_PAGES);
                 }
                 write_unlock_irq(&mapping->tree_lock);
                 radix_tree_preload_end();
@@ -489,6 +488,12 @@ struct page *page_cache_alloc_cold(struct address_space *x)
  EXPORT_SYMBOL(page_cache_alloc_cold);
  #endif
  
+static int __sleep_on_page_lock(void *word)
+{       /* action routine given to __wait_on_bit_lock(); @word is unused */
+       io_schedule();
+       return 0;       /* unconditional */
+}
+
  /*
   * In order to wait for pages to become available there must be
   * waitqueues associated with pages. By using a hash table of
@@ -578,13 +583,24 @@ void fastcall __lock_page(struct page *page)
  }
  EXPORT_SYMBOL(__lock_page);
  
+/*
+ * Variant of lock_page() that does not require the caller to hold a reference
+ * on the page's mapping.  Waits for PG_locked in TASK_UNINTERRUPTIBLE state.
+ */
+void fastcall __lock_page_nosync(struct page *page)
+{
+       DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+       __wait_on_bit_lock(page_waitqueue(page), &wait, __sleep_on_page_lock,
+                                                       TASK_UNINTERRUPTIBLE);
+}
+
  /**
   * find_get_page - find and get a page reference
   * @mapping: the address_space to search
   * @offset: the page index
   *
- * A rather lightweight function, finding and getting a reference to a
- * hashed page atomically.
+ * Is there a pagecache struct page at the given (mapping, offset) tuple?
+ * If yes, increment its refcount and return it; if no, return NULL.
   */
  struct page * find_get_page(struct address_space *mapping, unsigned long offset)
  {
@@ -828,6 +844,30 @@ grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
  }
  EXPORT_SYMBOL(grab_cache_page_nowait);
  
+/*
+ * CD/DVDs are error prone. When a medium error occurs, the driver may fail
+ * a _large_ part of the i/o request. Imagine the worst scenario:
+ *
+ *      ---R__________________________________________B__________
+ *         ^ reading here                             ^ bad block (assume 4k)
+ *
+ * read(R) => miss => readahead(R...B) => media error => frustrating retries
+ * => failing the whole request => read(R) => read(R+1) =>
+ * readahead(R+1...B+1) => bang => read(R+2) => read(R+3) =>
+ * readahead(R+3...B+2) => bang => read(R+3) => read(R+4) =>
+ * readahead(R+4...B+3) => bang => read(R+4) => read(R+5) => ......
+ *
+ * It is going insane. Fix it by quartering ra->ra_pages (@filp is unused).
+ */
+static void shrink_readahead_size_eio(struct file *filp,
+                                       struct file_ra_state *ra)
+{
+       if (!ra->ra_pages) /* already zero: nothing left to shrink */
+               return;
+
+       ra->ra_pages /= 4;
+}
+
  /**
   * do_generic_mapping_read - generic file read routine
   * @mapping:   address_space to be read
@@ -947,7 +987,7 @@ page_not_up_to_date:
                 /* Get exclusive access to the page ... */
                 lock_page(page);
  
-               /* Did it get unhashed before we got the lock? */
+               /* Did it get truncated before we got the lock? */
                 if (!page->mapping) {
                         unlock_page(page);
                         page_cache_release(page);
@@ -985,6 +1025,7 @@ readpage:
                                 }
                                 unlock_page(page);
                                 error = -EIO;
+                               shrink_readahead_size_eio(filp, &ra);
                                 goto readpage_error;
                         }
                         unlock_page(page);
@@ -1389,7 +1430,7 @@ retry_find:
                  */
                 if (!did_readaround) {
                         majmin = VM_FAULT_MAJOR;
-                       inc_page_state(pgmajfault);
+                       count_vm_event(PGMAJFAULT);
                 }
                 did_readaround = 1;
                 ra_pages = max_sane_readahead(file->f_ra.ra_pages);
@@ -1430,7 +1471,7 @@ outside_data_content:
          * accessible..
          */
         if (area->vm_mm == current->mm)
-               return NULL;
+               return NOPAGE_SIGBUS;
         /* Fall through to the non-read-ahead case */
  no_cached_page:
         /*
@@ -1455,12 +1496,12 @@ no_cached_page:
          */
         if (error == -ENOMEM)
                 return NOPAGE_OOM;
-       return NULL;
+       return NOPAGE_SIGBUS;
  
  page_not_uptodate:
         if (!did_readaround) {
                 majmin = VM_FAULT_MAJOR;
-               inc_page_state(pgmajfault);
+               count_vm_event(PGMAJFAULT);
         }
         lock_page(page);
  
@@ -1522,8 +1563,9 @@ page_not_uptodate:
          * Things didn't work out. Return zero to tell the
          * mm layer so, possibly freeing the page cache page first.
          */
+       shrink_readahead_size_eio(file, ra);
         page_cache_release(page);
-       return NULL;
+       return NOPAGE_SIGBUS;
  }
  EXPORT_SYMBOL(filemap_nopage);
  
@@ -1585,7 +1627,7 @@ no_cached_page:
  page_not_uptodate:
         lock_page(page);
  
-       /* Did it get unhashed while we waited for it? */
+       /* Did it get truncated while we waited for it? */
         if (!page->mapping) {
                 unlock_page(page);
                 goto err;
@@ -1978,6 +2020,7 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
                 if (unlikely(*pos + *count > inode->i_sb->s_maxbytes))
                         *count = inode->i_sb->s_maxbytes - *pos;
         } else {
+#ifdef CONFIG_BLOCK
                 loff_t isize;
                 if (bdev_read_only(I_BDEV(inode)))
                         return -EPERM;
@@ -1989,6 +2032,9 @@ inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, i
  
                 if (*pos + *count > isize)
                         *count = isize - *pos;
+#else
+               return -EPERM;
+#endif
         }
         return 0;
  }
@@ -2041,7 +2087,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
  {
         struct file *file = iocb->ki_filp;
         struct address_space * mapping = file->f_mapping;
-       struct address_space_operations *a_ops = mapping->a_ops;
+       const struct address_space_operations *a_ops = mapping->a_ops;
         struct inode    *inode = mapping->host;
         long            status = 0;
         struct page     *page;
@@ -2067,14 +2113,21 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
         do {
                 unsigned long index;
                 unsigned long offset;
-               unsigned long maxlen;
                 size_t copied;
  
                 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
                 index = pos >> PAGE_CACHE_SHIFT;
                 bytes = PAGE_CACHE_SIZE - offset;
-               if (bytes > count)
-                       bytes = count;
+
+               /* Limit the size of the copy to the caller's write size */
+               bytes = min(bytes, count);
+
+               /*
+                * Limit the size of the copy to that of the current segment,
+                * because fault_in_pages_readable() doesn't know how to walk
+                * segments.
+                */
+               bytes = min(bytes, cur_iov->iov_len - iov_base);
  
                 /*
                  * Bring in the user page that we will copy from _first_.
@@ -2082,10 +2135,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                  * same page as we're writing to, without it being marked
                  * up-to-date.
                  */
-               maxlen = cur_iov->iov_len - iov_base;
-               if (maxlen > bytes)
-                       maxlen = bytes;
-               fault_in_pages_readable(buf, maxlen);
+               fault_in_pages_readable(buf, bytes);
  
                 page = __grab_cache_page(mapping,index,&cached_page,&lru_pvec);
                 if (!page) {
@@ -2093,6 +2143,12 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                         break;
                 }
  
+               if (unlikely(bytes == 0)) {
+                       status = 0;
+                       copied = 0;
+                       goto zero_length_segment;
+               }
+
                 status = a_ops->prepare_write(file, page, offset, offset+bytes);
                 if (unlikely(status)) {
                         loff_t isize = i_size_read(inode);
@@ -2122,7 +2178,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
                         page_cache_release(page);
                         continue;
                 }
-               if (likely(copied > 0)) {
+zero_length_segment:
+               if (likely(copied >= 0)) {
                         if (!status)
                                 status = copied;
  
@@ -2187,7 +2244,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
                                 unsigned long nr_segs, loff_t *ppos)
  {
         struct file *file = iocb->ki_filp;
-       struct address_space * mapping = file->f_mapping;
+       const struct address_space * mapping = file->f_mapping;
         size_t ocount;          /* original count */
         size_t count;           /* after file limit checks */
         struct inode    *inode = mapping->host;
@@ -2438,3 +2495,33 @@ generic_file_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
         }
         return retval;
  }
+
+/**
+ * try_to_release_page() - release old fs-specific metadata on a page
+ * @page: the page which the kernel is trying to free; must be locked
+ * @gfp_mask: memory allocation flags (and I/O mode)
+ *
+ * Ask the address_space to release any data held against the page
+ * (presumably at page->private), using ->releasepage() when the mapping has
+ * one and try_to_free_buffers() otherwise.  Return `1' if the release was
+ * successful, zero if not; a page under writeback always yields zero.
+ *
+ * The @gfp_mask argument specifies whether I/O may be performed to release
+ * this page (__GFP_IO), and whether the call may block (__GFP_WAIT).
+ *
+ * NOTE: @gfp_mask may go away, and this function may become non-blocking.
+ */
+int try_to_release_page(struct page *page, gfp_t gfp_mask)
+{
+       struct address_space * const mapping = page->mapping;
+
+       BUG_ON(!PageLocked(page));
+       if (PageWriteback(page))
+               return 0;
+
+       if (mapping && mapping->a_ops->releasepage)
+               return mapping->a_ops->releasepage(page, gfp_mask);
+       return try_to_free_buffers(page);
+}
+
+EXPORT_SYMBOL(try_to_release_page);