]> bbs.cooldavid.org Git - net-next-2.6.git/blame - fs/buffer.c
Merge master.kernel.org:/home/rmk/linux-2.6-arm
[net-next-2.6.git] / fs / buffer.c
CommitLineData
1da177e4
LT
1/*
2 * linux/fs/buffer.c
3 *
4 * Copyright (C) 1991, 1992, 2002 Linus Torvalds
5 */
6
7/*
8 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
9 *
10 * Removed a lot of unnecessary code and simplified things now that
11 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
12 *
13 * Speed up hash, lru, and free list operations. Use gfp() for allocating
14 * hash table, use SLAB cache for buffer heads. SMP threading. -DaveM
15 *
16 * Added 32k buffer block sizes - these are required older ARM systems. - RMK
17 *
18 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
19 */
20
1da177e4
LT
21#include <linux/kernel.h>
22#include <linux/syscalls.h>
23#include <linux/fs.h>
24#include <linux/mm.h>
25#include <linux/percpu.h>
26#include <linux/slab.h>
16f7e0fe 27#include <linux/capability.h>
1da177e4
LT
28#include <linux/blkdev.h>
29#include <linux/file.h>
30#include <linux/quotaops.h>
31#include <linux/highmem.h>
32#include <linux/module.h>
33#include <linux/writeback.h>
34#include <linux/hash.h>
35#include <linux/suspend.h>
36#include <linux/buffer_head.h>
55e829af 37#include <linux/task_io_accounting_ops.h>
1da177e4
LT
38#include <linux/bio.h>
39#include <linux/notifier.h>
40#include <linux/cpu.h>
41#include <linux/bitops.h>
42#include <linux/mpage.h>
fb1c8f93 43#include <linux/bit_spinlock.h>
1da177e4
LT
44
45static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
1da177e4
LT
46
47#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
48
49inline void
50init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
51{
52 bh->b_end_io = handler;
53 bh->b_private = private;
54}
55
56static int sync_buffer(void *word)
57{
58 struct block_device *bd;
59 struct buffer_head *bh
60 = container_of(word, struct buffer_head, b_state);
61
62 smp_mb();
63 bd = bh->b_bdev;
64 if (bd)
65 blk_run_address_space(bd->bd_inode->i_mapping);
66 io_schedule();
67 return 0;
68}
69
fc9b52cd 70void __lock_buffer(struct buffer_head *bh)
1da177e4
LT
71{
72 wait_on_bit_lock(&bh->b_state, BH_Lock, sync_buffer,
73 TASK_UNINTERRUPTIBLE);
74}
75EXPORT_SYMBOL(__lock_buffer);
76
fc9b52cd 77void unlock_buffer(struct buffer_head *bh)
1da177e4 78{
51b07fc3 79 clear_bit_unlock(BH_Lock, &bh->b_state);
1da177e4
LT
80 smp_mb__after_clear_bit();
81 wake_up_bit(&bh->b_state, BH_Lock);
82}
83
84/*
85 * Block until a buffer comes unlocked. This doesn't stop it
86 * from becoming locked again - you have to lock it yourself
87 * if you want to preserve its state.
88 */
89void __wait_on_buffer(struct buffer_head * bh)
90{
91 wait_on_bit(&bh->b_state, BH_Lock, sync_buffer, TASK_UNINTERRUPTIBLE);
92}
93
/*
 * Detach the private (buffer) data from @page: clear PagePrivate, zero the
 * private word, and drop one page reference.
 */
static void __clear_page_buffers(struct page *page)
{
	ClearPagePrivate(page);
	set_page_private(page, 0);

	page_cache_release(page);
}
101
102static void buffer_io_error(struct buffer_head *bh)
103{
104 char b[BDEVNAME_SIZE];
105
106 printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
107 bdevname(bh->b_bdev, b),
108 (unsigned long long)bh->b_blocknr);
109}
110
/*
 * Helper for read completion handlers that must not touch the bh once it
 * has been unlocked: set or clear the uptodate bit according to @uptodate,
 * then unlock the buffer as the very last step.
 *
 * Note: unlock_buffer() does use the bh's address after clearing the lock
 * bit, but only for hashing, so that race is benign.
 */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
	if (!uptodate) {
		/* This happens, due to failed READA attempts. */
		clear_buffer_uptodate(bh);
	} else {
		set_buffer_uptodate(bh);
	}

	unlock_buffer(bh);
}
129
/*
 * Default synchronous read completion handler (also what ll_rw_block uses):
 * update the uptodate bit, unlock the buffer, then drop a reference on it.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);

	put_bh(bh);
}
139
140void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
141{
142 char b[BDEVNAME_SIZE];
143
144 if (uptodate) {
145 set_buffer_uptodate(bh);
146 } else {
147 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
148 buffer_io_error(bh);
149 printk(KERN_WARNING "lost page write due to "
150 "I/O error on %s\n",
151 bdevname(bh->b_bdev, b));
152 }
153 set_buffer_write_io_error(bh);
154 clear_buffer_uptodate(bh);
155 }
156 unlock_buffer(bh);
157 put_bh(bh);
158}
159
160/*
161 * Write out and wait upon all the dirty data associated with a block
162 * device via its mapping. Does not take the superblock lock.
163 */
164int sync_blockdev(struct block_device *bdev)
165{
166 int ret = 0;
167
28fd1298
OH
168 if (bdev)
169 ret = filemap_write_and_wait(bdev->bd_inode->i_mapping);
1da177e4
LT
170 return ret;
171}
172EXPORT_SYMBOL(sync_blockdev);
173
1da177e4
LT
/*
 * Write out and wait upon all dirty data associated with this device:
 * filesystem data as well as the underlying block device.  Takes the
 * superblock lock.
 *
 * If a superblock is found for @bdev the result of fsync_super() is
 * returned; otherwise this falls back to sync_blockdev().
 */
int fsync_bdev(struct block_device *bdev)
{
	struct super_block *sb = get_super(bdev);
	int res;

	if (!sb)
		return sync_blockdev(bdev);

	res = fsync_super(sb);
	drop_super(sb);
	return res;
}
189
190/**
191 * freeze_bdev -- lock a filesystem and force it into a consistent state
192 * @bdev: blockdevice to lock
193 *
f73ca1b7 194 * This takes the block device bd_mount_sem to make sure no new mounts
1da177e4
LT
195 * happen on bdev until thaw_bdev() is called.
196 * If a superblock is found on this device, we take the s_umount semaphore
197 * on it to make sure nobody unmounts until the snapshot creation is done.
198 */
199struct super_block *freeze_bdev(struct block_device *bdev)
200{
201 struct super_block *sb;
202
f73ca1b7 203 down(&bdev->bd_mount_sem);
1da177e4
LT
204 sb = get_super(bdev);
205 if (sb && !(sb->s_flags & MS_RDONLY)) {
206 sb->s_frozen = SB_FREEZE_WRITE;
d59dd462 207 smp_wmb();
1da177e4 208
d25b9a1f 209 __fsync_super(sb);
1da177e4
LT
210
211 sb->s_frozen = SB_FREEZE_TRANS;
d59dd462 212 smp_wmb();
1da177e4
LT
213
214 sync_blockdev(sb->s_bdev);
215
216 if (sb->s_op->write_super_lockfs)
217 sb->s_op->write_super_lockfs(sb);
218 }
219
220 sync_blockdev(bdev);
221 return sb; /* thaw_bdev releases s->s_umount and bd_mount_sem */
222}
223EXPORT_SYMBOL(freeze_bdev);
224
225/**
226 * thaw_bdev -- unlock filesystem
227 * @bdev: blockdevice to unlock
228 * @sb: associated superblock
229 *
230 * Unlocks the filesystem and marks it writeable again after freeze_bdev().
231 */
232void thaw_bdev(struct block_device *bdev, struct super_block *sb)
233{
234 if (sb) {
235 BUG_ON(sb->s_bdev != bdev);
236
237 if (sb->s_op->unlockfs)
238 sb->s_op->unlockfs(sb);
239 sb->s_frozen = SB_UNFROZEN;
d59dd462 240 smp_wmb();
1da177e4
LT
241 wake_up(&sb->s_wait_unfrozen);
242 drop_super(sb);
243 }
244
f73ca1b7 245 up(&bdev->bd_mount_sem);
1da177e4
LT
246}
247EXPORT_SYMBOL(thaw_bdev);
248
1da177e4
LT
249/*
250 * Various filesystems appear to want __find_get_block to be non-blocking.
251 * But it's the page lock which protects the buffers. To get around this,
252 * we get exclusion from try_to_free_buffers with the blockdev mapping's
253 * private_lock.
254 *
255 * Hack idea: for the blockdev mapping, i_bufferlist_lock contention
256 * may be quite high. This code could TryLock the page, and if that
257 * succeeds, there is no need to take private_lock. (But if
258 * private_lock is contended then so is mapping->tree_lock).
259 */
260static struct buffer_head *
385fd4c5 261__find_get_block_slow(struct block_device *bdev, sector_t block)
1da177e4
LT
262{
263 struct inode *bd_inode = bdev->bd_inode;
264 struct address_space *bd_mapping = bd_inode->i_mapping;
265 struct buffer_head *ret = NULL;
266 pgoff_t index;
267 struct buffer_head *bh;
268 struct buffer_head *head;
269 struct page *page;
270 int all_mapped = 1;
271
272 index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits);
273 page = find_get_page(bd_mapping, index);
274 if (!page)
275 goto out;
276
277 spin_lock(&bd_mapping->private_lock);
278 if (!page_has_buffers(page))
279 goto out_unlock;
280 head = page_buffers(page);
281 bh = head;
282 do {
283 if (bh->b_blocknr == block) {
284 ret = bh;
285 get_bh(bh);
286 goto out_unlock;
287 }
288 if (!buffer_mapped(bh))
289 all_mapped = 0;
290 bh = bh->b_this_page;
291 } while (bh != head);
292
293 /* we might be here because some of the buffers on this page are
294 * not mapped. This is due to various races between
295 * file io on the block device and getblk. It gets dealt with
296 * elsewhere, don't buffer_error if we had some unmapped buffers
297 */
298 if (all_mapped) {
299 printk("__find_get_block_slow() failed. "
300 "block=%llu, b_blocknr=%llu\n",
205f87f6
BP
301 (unsigned long long)block,
302 (unsigned long long)bh->b_blocknr);
303 printk("b_state=0x%08lx, b_size=%zu\n",
304 bh->b_state, bh->b_size);
1da177e4
LT
305 printk("device blocksize: %d\n", 1 << bd_inode->i_blkbits);
306 }
307out_unlock:
308 spin_unlock(&bd_mapping->private_lock);
309 page_cache_release(page);
310out:
311 return ret;
312}
313
314/* If invalidate_buffers() will trash dirty buffers, it means some kind
315 of fs corruption is going on. Trashing dirty data always imply losing
316 information that was supposed to be just stored on the physical layer
317 by the user.
318
319 Thus invalidate_buffers in general usage is not allowed to trash
320 dirty buffers. For example ioctl(FLSBLKBUF) expects dirty data to
321 be preserved. These buffers are simply skipped.
322
323 We also skip buffers which are still in use. For example this can
324 happen if a userspace program is reading the block device.
325
326 NOTE: In the case where the user removed a removable-media-disk even if
327 there's still dirty data not synced on disk (due a bug in the device driver
328 or due an error of the user), by not destroying the dirty buffers we could
329 generate corruption also on the next media inserted, thus a parameter is
330 necessary to handle this case in the most safe way possible (trying
331 to not corrupt also the new disk inserted with the data belonging to
332 the old now corrupted disk). Also for the ramdisk the natural thing
333 to do in order to release the ramdisk memory is to destroy dirty buffers.
334
335 These are two special cases. Normal usage imply the device driver
336 to issue a sync on the device (without waiting I/O completion) and
337 then an invalidate_buffers call that doesn't trash dirty buffers.
338
339 For handling cache coherency with the blkdev pagecache the 'update' case
340 has been introduced. It is needed to re-read from disk any pinned
341 buffer. NOTE: re-reading from disk is destructive so we can do it only
342 when we assume nobody is changing the buffercache under our I/O and when
343 we think the disk contains more recent information than the buffercache.
344 The update == 1 pass marks the buffers we need to update, the update == 2
345 pass does the actual I/O. */
f98393a6 346void invalidate_bdev(struct block_device *bdev)
1da177e4 347{
0e1dfc66
AM
348 struct address_space *mapping = bdev->bd_inode->i_mapping;
349
350 if (mapping->nrpages == 0)
351 return;
352
1da177e4 353 invalidate_bh_lrus();
fc0ecff6 354 invalidate_mapping_pages(mapping, 0, -1);
1da177e4
LT
355}
356
357/*
358 * Kick pdflush then try to free up some ZONE_NORMAL memory.
359 */
360static void free_more_memory(void)
361{
19770b32 362 struct zone *zone;
0e88460d 363 int nid;
1da177e4 364
687a21ce 365 wakeup_pdflush(1024);
1da177e4
LT
366 yield();
367
0e88460d 368 for_each_online_node(nid) {
19770b32
MG
369 (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS),
370 gfp_zone(GFP_NOFS), NULL,
371 &zone);
372 if (zone)
54a6eb5c
MG
373 try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
374 GFP_NOFS);
1da177e4
LT
375 }
376}
377
378/*
379 * I/O completion handler for block_read_full_page() - pages
380 * which come unlocked at the end of I/O.
381 */
382static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
383{
1da177e4 384 unsigned long flags;
a3972203 385 struct buffer_head *first;
1da177e4
LT
386 struct buffer_head *tmp;
387 struct page *page;
388 int page_uptodate = 1;
389
390 BUG_ON(!buffer_async_read(bh));
391
392 page = bh->b_page;
393 if (uptodate) {
394 set_buffer_uptodate(bh);
395 } else {
396 clear_buffer_uptodate(bh);
397 if (printk_ratelimit())
398 buffer_io_error(bh);
399 SetPageError(page);
400 }
401
402 /*
403 * Be _very_ careful from here on. Bad things can happen if
404 * two buffer heads end IO at almost the same time and both
405 * decide that the page is now completely done.
406 */
a3972203
NP
407 first = page_buffers(page);
408 local_irq_save(flags);
409 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
1da177e4
LT
410 clear_buffer_async_read(bh);
411 unlock_buffer(bh);
412 tmp = bh;
413 do {
414 if (!buffer_uptodate(tmp))
415 page_uptodate = 0;
416 if (buffer_async_read(tmp)) {
417 BUG_ON(!buffer_locked(tmp));
418 goto still_busy;
419 }
420 tmp = tmp->b_this_page;
421 } while (tmp != bh);
a3972203
NP
422 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
423 local_irq_restore(flags);
1da177e4
LT
424
425 /*
426 * If none of the buffers had errors and they are all
427 * uptodate then we can set the page uptodate.
428 */
429 if (page_uptodate && !PageError(page))
430 SetPageUptodate(page);
431 unlock_page(page);
432 return;
433
434still_busy:
a3972203
NP
435 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
436 local_irq_restore(flags);
1da177e4
LT
437 return;
438}
439
440/*
441 * Completion handler for block_write_full_page() - pages which are unlocked
442 * during I/O, and which have PageWriteback cleared upon I/O completion.
443 */
b6cd0b77 444static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
1da177e4
LT
445{
446 char b[BDEVNAME_SIZE];
1da177e4 447 unsigned long flags;
a3972203 448 struct buffer_head *first;
1da177e4
LT
449 struct buffer_head *tmp;
450 struct page *page;
451
452 BUG_ON(!buffer_async_write(bh));
453
454 page = bh->b_page;
455 if (uptodate) {
456 set_buffer_uptodate(bh);
457 } else {
458 if (printk_ratelimit()) {
459 buffer_io_error(bh);
460 printk(KERN_WARNING "lost page write due to "
461 "I/O error on %s\n",
462 bdevname(bh->b_bdev, b));
463 }
464 set_bit(AS_EIO, &page->mapping->flags);
58ff407b 465 set_buffer_write_io_error(bh);
1da177e4
LT
466 clear_buffer_uptodate(bh);
467 SetPageError(page);
468 }
469
a3972203
NP
470 first = page_buffers(page);
471 local_irq_save(flags);
472 bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
473
1da177e4
LT
474 clear_buffer_async_write(bh);
475 unlock_buffer(bh);
476 tmp = bh->b_this_page;
477 while (tmp != bh) {
478 if (buffer_async_write(tmp)) {
479 BUG_ON(!buffer_locked(tmp));
480 goto still_busy;
481 }
482 tmp = tmp->b_this_page;
483 }
a3972203
NP
484 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
485 local_irq_restore(flags);
1da177e4
LT
486 end_page_writeback(page);
487 return;
488
489still_busy:
a3972203
NP
490 bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
491 local_irq_restore(flags);
1da177e4
LT
492 return;
493}
494
495/*
496 * If a page's buffers are under async readin (end_buffer_async_read
497 * completion) then there is a possibility that another thread of
498 * control could lock one of the buffers after it has completed
499 * but while some of the other buffers have not completed. This
500 * locked buffer would confuse end_buffer_async_read() into not unlocking
501 * the page. So the absence of BH_Async_Read tells end_buffer_async_read()
502 * that this buffer is not under async I/O.
503 *
504 * The page comes unlocked when it has no locked buffer_async buffers
505 * left.
506 *
507 * PageLocked prevents anyone starting new async I/O reads any of
508 * the buffers.
509 *
510 * PageWriteback is used to prevent simultaneous writeout of the same
511 * page.
512 *
513 * PageLocked prevents anyone from starting writeback of a page which is
514 * under read I/O (PageWriteback is only ever set against a locked page).
515 */
516static void mark_buffer_async_read(struct buffer_head *bh)
517{
518 bh->b_end_io = end_buffer_async_read;
519 set_buffer_async_read(bh);
520}
521
522void mark_buffer_async_write(struct buffer_head *bh)
523{
524 bh->b_end_io = end_buffer_async_write;
525 set_buffer_async_write(bh);
526}
527EXPORT_SYMBOL(mark_buffer_async_write);
528
529
530/*
531 * fs/buffer.c contains helper functions for buffer-backed address space's
532 * fsync functions. A common requirement for buffer-based filesystems is
533 * that certain data from the backing blockdev needs to be written out for
534 * a successful fsync(). For example, ext2 indirect blocks need to be
535 * written back and waited upon before fsync() returns.
536 *
537 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
538 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
539 * management of a list of dependent buffers at ->i_mapping->private_list.
540 *
541 * Locking is a little subtle: try_to_free_buffers() will remove buffers
542 * from their controlling inode's queue when they are being freed. But
543 * try_to_free_buffers() will be operating against the *blockdev* mapping
544 * at the time, not against the S_ISREG file which depends on those buffers.
545 * So the locking for private_list is via the private_lock in the address_space
546 * which backs the buffers. Which is different from the address_space
547 * against which the buffers are listed. So for a particular address_space,
548 * mapping->private_lock does *not* protect mapping->private_list! In fact,
549 * mapping->private_list will always be protected by the backing blockdev's
550 * ->private_lock.
551 *
552 * Which introduces a requirement: all buffers on an address_space's
553 * ->private_list must be from the same address_space: the blockdev's.
554 *
555 * address_spaces which do not place buffers at ->private_list via these
556 * utility functions are free to use private_lock and private_list for
557 * whatever they want. The only requirement is that list_empty(private_list)
558 * be true at clear_inode() time.
559 *
560 * FIXME: clear_inode should not call invalidate_inode_buffers(). The
561 * filesystems should do that. invalidate_inode_buffers() should just go
562 * BUG_ON(!list_empty).
563 *
564 * FIXME: mark_buffer_dirty_inode() is a data-plane operation. It should
565 * take an address_space, not an inode. And it should be called
566 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
567 * queued up.
568 *
569 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
570 * list if it is already on a list. Because if the buffer is on a list,
571 * it *must* already be on the right one. If not, the filesystem is being
572 * silly. This will save a ton of locking. But first we have to ensure
573 * that buffers are taken *off* the old inode's list when they are freed
574 * (presumably in truncate). That requires careful auditing of all
575 * filesystems (do it inside bforget()). It could also be done by bringing
576 * b_inode back.
577 */
578
579/*
580 * The buffer's backing address_space's private_lock must be held
581 */
dbacefc9 582static void __remove_assoc_queue(struct buffer_head *bh)
1da177e4
LT
583{
584 list_del_init(&bh->b_assoc_buffers);
58ff407b
JK
585 WARN_ON(!bh->b_assoc_map);
586 if (buffer_write_io_error(bh))
587 set_bit(AS_EIO, &bh->b_assoc_map->flags);
588 bh->b_assoc_map = NULL;
1da177e4
LT
589}
590
591int inode_has_buffers(struct inode *inode)
592{
593 return !list_empty(&inode->i_data.private_list);
594}
595
596/*
597 * osync is designed to support O_SYNC io. It waits synchronously for
598 * all already-submitted IO to complete, but does not queue any new
599 * writes to the disk.
600 *
601 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
602 * you dirty the buffers, and then use osync_inode_buffers to wait for
603 * completion. Any other dirty buffers which are not yet queued for
604 * write will not be flushed to disk by the osync.
605 */
606static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
607{
608 struct buffer_head *bh;
609 struct list_head *p;
610 int err = 0;
611
612 spin_lock(lock);
613repeat:
614 list_for_each_prev(p, list) {
615 bh = BH_ENTRY(p);
616 if (buffer_locked(bh)) {
617 get_bh(bh);
618 spin_unlock(lock);
619 wait_on_buffer(bh);
620 if (!buffer_uptodate(bh))
621 err = -EIO;
622 brelse(bh);
623 spin_lock(lock);
624 goto repeat;
625 }
626 }
627 spin_unlock(lock);
628 return err;
629}
630
631/**
78a4a50a 632 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
67be2dd1 633 * @mapping: the mapping which wants those buffers written
1da177e4
LT
634 *
635 * Starts I/O against the buffers at mapping->private_list, and waits upon
636 * that I/O.
637 *
67be2dd1
MW
638 * Basically, this is a convenience function for fsync().
639 * @mapping is a file or directory which needs those buffers to be written for
640 * a successful fsync().
1da177e4
LT
641 */
642int sync_mapping_buffers(struct address_space *mapping)
643{
644 struct address_space *buffer_mapping = mapping->assoc_mapping;
645
646 if (buffer_mapping == NULL || list_empty(&mapping->private_list))
647 return 0;
648
649 return fsync_buffers_list(&buffer_mapping->private_lock,
650 &mapping->private_list);
651}
652EXPORT_SYMBOL(sync_mapping_buffers);
653
654/*
655 * Called when we've recently written block `bblock', and it is known that
656 * `bblock' was for a buffer_boundary() buffer. This means that the block at
657 * `bblock + 1' is probably a dirty indirect block. Hunt it down and, if it's
658 * dirty, schedule it for IO. So that indirects merge nicely with their data.
659 */
660void write_boundary_block(struct block_device *bdev,
661 sector_t bblock, unsigned blocksize)
662{
663 struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
664 if (bh) {
665 if (buffer_dirty(bh))
666 ll_rw_block(WRITE, 1, &bh);
667 put_bh(bh);
668 }
669}
670
671void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
672{
673 struct address_space *mapping = inode->i_mapping;
674 struct address_space *buffer_mapping = bh->b_page->mapping;
675
676 mark_buffer_dirty(bh);
677 if (!mapping->assoc_mapping) {
678 mapping->assoc_mapping = buffer_mapping;
679 } else {
e827f923 680 BUG_ON(mapping->assoc_mapping != buffer_mapping);
1da177e4 681 }
535ee2fb 682 if (!bh->b_assoc_map) {
1da177e4
LT
683 spin_lock(&buffer_mapping->private_lock);
684 list_move_tail(&bh->b_assoc_buffers,
685 &mapping->private_list);
58ff407b 686 bh->b_assoc_map = mapping;
1da177e4
LT
687 spin_unlock(&buffer_mapping->private_lock);
688 }
689}
690EXPORT_SYMBOL(mark_buffer_dirty_inode);
691
787d2214
NP
692/*
693 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
694 * dirty.
695 *
696 * If warn is true, then emit a warning if the page is not uptodate and has
697 * not been truncated.
698 */
699static int __set_page_dirty(struct page *page,
700 struct address_space *mapping, int warn)
701{
702 if (unlikely(!mapping))
703 return !TestSetPageDirty(page);
704
705 if (TestSetPageDirty(page))
706 return 0;
707
19fd6231 708 spin_lock_irq(&mapping->tree_lock);
787d2214
NP
709 if (page->mapping) { /* Race with truncate? */
710 WARN_ON_ONCE(warn && !PageUptodate(page));
711
712 if (mapping_cap_account_dirty(mapping)) {
713 __inc_zone_page_state(page, NR_FILE_DIRTY);
c9e51e41
PZ
714 __inc_bdi_stat(mapping->backing_dev_info,
715 BDI_RECLAIMABLE);
787d2214
NP
716 task_io_account_write(PAGE_CACHE_SIZE);
717 }
718 radix_tree_tag_set(&mapping->page_tree,
719 page_index(page), PAGECACHE_TAG_DIRTY);
720 }
19fd6231 721 spin_unlock_irq(&mapping->tree_lock);
787d2214
NP
722 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
723
724 return 1;
725}
726
1da177e4
LT
727/*
728 * Add a page to the dirty page list.
729 *
730 * It is a sad fact of life that this function is called from several places
731 * deeply under spinlocking. It may not sleep.
732 *
733 * If the page has buffers, the uptodate buffers are set dirty, to preserve
734 * dirty-state coherency between the page and the buffers. If the page does
735 * not have buffers then when they are later attached they will all be set
736 * dirty.
737 *
738 * The buffers are dirtied before the page is dirtied. There's a small race
739 * window in which a writepage caller may see the page cleanness but not the
740 * buffer dirtiness. That's fine. If this code were to set the page dirty
741 * before the buffers, a concurrent writepage caller could clear the page dirty
742 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
743 * page on the dirty page list.
744 *
745 * We use private_lock to lock against try_to_free_buffers while using the
746 * page's buffer list. Also use this to protect against clean buffers being
747 * added to the page after it was set dirty.
748 *
749 * FIXME: may need to call ->reservepage here as well. That's rather up to the
750 * address_space though.
751 */
752int __set_page_dirty_buffers(struct page *page)
753{
787d2214 754 struct address_space *mapping = page_mapping(page);
ebf7a227
NP
755
756 if (unlikely(!mapping))
757 return !TestSetPageDirty(page);
1da177e4
LT
758
759 spin_lock(&mapping->private_lock);
760 if (page_has_buffers(page)) {
761 struct buffer_head *head = page_buffers(page);
762 struct buffer_head *bh = head;
763
764 do {
765 set_buffer_dirty(bh);
766 bh = bh->b_this_page;
767 } while (bh != head);
768 }
769 spin_unlock(&mapping->private_lock);
770
787d2214 771 return __set_page_dirty(page, mapping, 1);
1da177e4
LT
772}
773EXPORT_SYMBOL(__set_page_dirty_buffers);
774
775/*
776 * Write out and wait upon a list of buffers.
777 *
778 * We have conflicting pressures: we want to make sure that all
779 * initially dirty buffers get waited on, but that any subsequently
780 * dirtied buffers don't. After all, we don't want fsync to last
781 * forever if somebody is actively writing to the file.
782 *
783 * Do this in two main stages: first we copy dirty buffers to a
784 * temporary inode list, queueing the writes as we go. Then we clean
785 * up, waiting for those writes to complete.
786 *
787 * During this second stage, any subsequent updates to the file may end
788 * up refiling the buffer on the original inode's dirty list again, so
789 * there is a chance we will end up with a buffer queued for write but
790 * not yet completed on that list. So, as a final cleanup we go through
791 * the osync code to catch these locked, dirty buffers without requeuing
792 * any newly dirty buffers for write.
793 */
794static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
795{
796 struct buffer_head *bh;
797 struct list_head tmp;
535ee2fb 798 struct address_space *mapping;
1da177e4
LT
799 int err = 0, err2;
800
801 INIT_LIST_HEAD(&tmp);
802
803 spin_lock(lock);
804 while (!list_empty(list)) {
805 bh = BH_ENTRY(list->next);
535ee2fb 806 mapping = bh->b_assoc_map;
58ff407b 807 __remove_assoc_queue(bh);
535ee2fb
JK
808 /* Avoid race with mark_buffer_dirty_inode() which does
809 * a lockless check and we rely on seeing the dirty bit */
810 smp_mb();
1da177e4
LT
811 if (buffer_dirty(bh) || buffer_locked(bh)) {
812 list_add(&bh->b_assoc_buffers, &tmp);
535ee2fb 813 bh->b_assoc_map = mapping;
1da177e4
LT
814 if (buffer_dirty(bh)) {
815 get_bh(bh);
816 spin_unlock(lock);
817 /*
818 * Ensure any pending I/O completes so that
819 * ll_rw_block() actually writes the current
820 * contents - it is a noop if I/O is still in
821 * flight on potentially older contents.
822 */
18ce3751 823 ll_rw_block(SWRITE_SYNC, 1, &bh);
1da177e4
LT
824 brelse(bh);
825 spin_lock(lock);
826 }
827 }
828 }
829
830 while (!list_empty(&tmp)) {
831 bh = BH_ENTRY(tmp.prev);
1da177e4 832 get_bh(bh);
535ee2fb
JK
833 mapping = bh->b_assoc_map;
834 __remove_assoc_queue(bh);
835 /* Avoid race with mark_buffer_dirty_inode() which does
836 * a lockless check and we rely on seeing the dirty bit */
837 smp_mb();
838 if (buffer_dirty(bh)) {
839 list_add(&bh->b_assoc_buffers,
e3892296 840 &mapping->private_list);
535ee2fb
JK
841 bh->b_assoc_map = mapping;
842 }
1da177e4
LT
843 spin_unlock(lock);
844 wait_on_buffer(bh);
845 if (!buffer_uptodate(bh))
846 err = -EIO;
847 brelse(bh);
848 spin_lock(lock);
849 }
850
851 spin_unlock(lock);
852 err2 = osync_buffers_list(lock, list);
853 if (err)
854 return err;
855 else
856 return err2;
857}
858
859/*
860 * Invalidate any and all dirty buffers on a given inode. We are
861 * probably unmounting the fs, but that doesn't mean we have already
862 * done a sync(). Just drop the buffers from the inode list.
863 *
864 * NOTE: we take the inode's blockdev's mapping's private_lock. Which
865 * assumes that all the buffers are against the blockdev. Not true
866 * for reiserfs.
867 */
868void invalidate_inode_buffers(struct inode *inode)
869{
870 if (inode_has_buffers(inode)) {
871 struct address_space *mapping = &inode->i_data;
872 struct list_head *list = &mapping->private_list;
873 struct address_space *buffer_mapping = mapping->assoc_mapping;
874
875 spin_lock(&buffer_mapping->private_lock);
876 while (!list_empty(list))
877 __remove_assoc_queue(BH_ENTRY(list->next));
878 spin_unlock(&buffer_mapping->private_lock);
879 }
880}
881
882/*
883 * Remove any clean buffers from the inode's buffer list. This is called
884 * when we're trying to free the inode itself. Those buffers can pin it.
885 *
886 * Returns true if all buffers were removed.
887 */
888int remove_inode_buffers(struct inode *inode)
889{
890 int ret = 1;
891
892 if (inode_has_buffers(inode)) {
893 struct address_space *mapping = &inode->i_data;
894 struct list_head *list = &mapping->private_list;
895 struct address_space *buffer_mapping = mapping->assoc_mapping;
896
897 spin_lock(&buffer_mapping->private_lock);
898 while (!list_empty(list)) {
899 struct buffer_head *bh = BH_ENTRY(list->next);
900 if (buffer_dirty(bh)) {
901 ret = 0;
902 break;
903 }
904 __remove_assoc_queue(bh);
905 }
906 spin_unlock(&buffer_mapping->private_lock);
907 }
908 return ret;
909}
910
911/*
912 * Create the appropriate buffers when given a page for data area and
913 * the size of each buffer.. Use the bh->b_this_page linked list to
914 * follow the buffers created. Return NULL if unable to create more
915 * buffers.
916 *
917 * The retry flag is used to differentiate async IO (paging, swapping)
918 * which may not fail from ordinary buffer allocations.
919 */
920struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size,
921 int retry)
922{
923 struct buffer_head *bh, *head;
924 long offset;
925
926try_again:
927 head = NULL;
928 offset = PAGE_SIZE;
929 while ((offset -= size) >= 0) {
930 bh = alloc_buffer_head(GFP_NOFS);
931 if (!bh)
932 goto no_grow;
933
934 bh->b_bdev = NULL;
935 bh->b_this_page = head;
936 bh->b_blocknr = -1;
937 head = bh;
938
939 bh->b_state = 0;
940 atomic_set(&bh->b_count, 0);
fc5cd582 941 bh->b_private = NULL;
1da177e4
LT
942 bh->b_size = size;
943
944 /* Link the buffer to its page */
945 set_bh_page(bh, page, offset);
946
01ffe339 947 init_buffer(bh, NULL, NULL);
1da177e4
LT
948 }
949 return head;
950/*
951 * In case anything failed, we just free everything we got.
952 */
953no_grow:
954 if (head) {
955 do {
956 bh = head;
957 head = head->b_this_page;
958 free_buffer_head(bh);
959 } while (head);
960 }
961
962 /*
963 * Return failure for non-async IO requests. Async IO requests
964 * are not allowed to fail, so we have to wait until buffer heads
965 * become available. But we don't want tasks sleeping with
966 * partially complete buffers, so all were released above.
967 */
968 if (!retry)
969 return NULL;
970
971 /* We're _really_ low on memory. Now we just
972 * wait for old buffer heads to become free due to
973 * finishing IO. Since this is an async request and
974 * the reserve list is empty, we're sure there are
975 * async buffer heads in use.
976 */
977 free_more_memory();
978 goto try_again;
979}
980EXPORT_SYMBOL_GPL(alloc_page_buffers);
981
982static inline void
983link_dev_buffers(struct page *page, struct buffer_head *head)
984{
985 struct buffer_head *bh, *tail;
986
987 bh = head;
988 do {
989 tail = bh;
990 bh = bh->b_this_page;
991 } while (bh);
992 tail->b_this_page = head;
993 attach_page_buffers(page, head);
994}
995
996/*
997 * Initialise the state of a blockdev page's buffers.
998 */
999static void
1000init_page_buffers(struct page *page, struct block_device *bdev,
1001 sector_t block, int size)
1002{
1003 struct buffer_head *head = page_buffers(page);
1004 struct buffer_head *bh = head;
1005 int uptodate = PageUptodate(page);
1006
1007 do {
1008 if (!buffer_mapped(bh)) {
1009 init_buffer(bh, NULL, NULL);
1010 bh->b_bdev = bdev;
1011 bh->b_blocknr = block;
1012 if (uptodate)
1013 set_buffer_uptodate(bh);
1014 set_buffer_mapped(bh);
1015 }
1016 block++;
1017 bh = bh->b_this_page;
1018 } while (bh != head);
1019}
1020
1021/*
1022 * Create the page-cache page that contains the requested block.
1023 *
1024 * This is user purely for blockdev mappings.
1025 */
1026static struct page *
1027grow_dev_page(struct block_device *bdev, sector_t block,
1028 pgoff_t index, int size)
1029{
1030 struct inode *inode = bdev->bd_inode;
1031 struct page *page;
1032 struct buffer_head *bh;
1033
ea125892 1034 page = find_or_create_page(inode->i_mapping, index,
769848c0 1035 (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE);
1da177e4
LT
1036 if (!page)
1037 return NULL;
1038
e827f923 1039 BUG_ON(!PageLocked(page));
1da177e4
LT
1040
1041 if (page_has_buffers(page)) {
1042 bh = page_buffers(page);
1043 if (bh->b_size == size) {
1044 init_page_buffers(page, bdev, block, size);
1045 return page;
1046 }
1047 if (!try_to_free_buffers(page))
1048 goto failed;
1049 }
1050
1051 /*
1052 * Allocate some buffers for this page
1053 */
1054 bh = alloc_page_buffers(page, size, 0);
1055 if (!bh)
1056 goto failed;
1057
1058 /*
1059 * Link the page to the buffers and initialise them. Take the
1060 * lock to be atomic wrt __find_get_block(), which does not
1061 * run under the page lock.
1062 */
1063 spin_lock(&inode->i_mapping->private_lock);
1064 link_dev_buffers(page, bh);
1065 init_page_buffers(page, bdev, block, size);
1066 spin_unlock(&inode->i_mapping->private_lock);
1067 return page;
1068
1069failed:
1070 BUG();
1071 unlock_page(page);
1072 page_cache_release(page);
1073 return NULL;
1074}
1075
1076/*
1077 * Create buffers for the specified block device block's page. If
1078 * that page was dirty, the buffers are set dirty also.
1da177e4 1079 */
858119e1 1080static int
1da177e4
LT
1081grow_buffers(struct block_device *bdev, sector_t block, int size)
1082{
1083 struct page *page;
1084 pgoff_t index;
1085 int sizebits;
1086
1087 sizebits = -1;
1088 do {
1089 sizebits++;
1090 } while ((size << sizebits) < PAGE_SIZE);
1091
1092 index = block >> sizebits;
1da177e4 1093
e5657933
AM
1094 /*
1095 * Check for a block which wants to lie outside our maximum possible
1096 * pagecache index. (this comparison is done using sector_t types).
1097 */
1098 if (unlikely(index != block >> sizebits)) {
1099 char b[BDEVNAME_SIZE];
1100
1101 printk(KERN_ERR "%s: requested out-of-range block %llu for "
1102 "device %s\n",
8e24eea7 1103 __func__, (unsigned long long)block,
e5657933
AM
1104 bdevname(bdev, b));
1105 return -EIO;
1106 }
1107 block = index << sizebits;
1da177e4
LT
1108 /* Create a page with the proper size buffers.. */
1109 page = grow_dev_page(bdev, block, index, size);
1110 if (!page)
1111 return 0;
1112 unlock_page(page);
1113 page_cache_release(page);
1114 return 1;
1115}
1116
75c96f85 1117static struct buffer_head *
1da177e4
LT
1118__getblk_slow(struct block_device *bdev, sector_t block, int size)
1119{
1120 /* Size must be multiple of hard sectorsize */
1121 if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
1122 (size < 512 || size > PAGE_SIZE))) {
1123 printk(KERN_ERR "getblk(): invalid block size %d requested\n",
1124 size);
1125 printk(KERN_ERR "hardsect size: %d\n",
1126 bdev_hardsect_size(bdev));
1127
1128 dump_stack();
1129 return NULL;
1130 }
1131
1132 for (;;) {
1133 struct buffer_head * bh;
e5657933 1134 int ret;
1da177e4
LT
1135
1136 bh = __find_get_block(bdev, block, size);
1137 if (bh)
1138 return bh;
1139
e5657933
AM
1140 ret = grow_buffers(bdev, block, size);
1141 if (ret < 0)
1142 return NULL;
1143 if (ret == 0)
1da177e4
LT
1144 free_more_memory();
1145 }
1146}
1147
1148/*
1149 * The relationship between dirty buffers and dirty pages:
1150 *
1151 * Whenever a page has any dirty buffers, the page's dirty bit is set, and
1152 * the page is tagged dirty in its radix tree.
1153 *
1154 * At all times, the dirtiness of the buffers represents the dirtiness of
1155 * subsections of the page. If the page has buffers, the page dirty bit is
1156 * merely a hint about the true dirty state.
1157 *
1158 * When a page is set dirty in its entirety, all its buffers are marked dirty
1159 * (if the page has buffers).
1160 *
1161 * When a buffer is marked dirty, its page is dirtied, but the page's other
1162 * buffers are not.
1163 *
1164 * Also. When blockdev buffers are explicitly read with bread(), they
1165 * individually become uptodate. But their backing page remains not
1166 * uptodate - even if all of its buffers are uptodate. A subsequent
1167 * block_read_full_page() against that page will discover all the uptodate
1168 * buffers, will set the page uptodate and will perform no I/O.
1169 */
1170
1171/**
1172 * mark_buffer_dirty - mark a buffer_head as needing writeout
67be2dd1 1173 * @bh: the buffer_head to mark dirty
1da177e4
LT
1174 *
1175 * mark_buffer_dirty() will set the dirty bit against the buffer, then set its
1176 * backing page dirty, then tag the page as dirty in its address_space's radix
1177 * tree and then attach the address_space's inode to its superblock's dirty
1178 * inode list.
1179 *
1180 * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock,
1181 * mapping->tree_lock and the global inode_lock.
1182 */
fc9b52cd 1183void mark_buffer_dirty(struct buffer_head *bh)
1da177e4 1184{
787d2214 1185 WARN_ON_ONCE(!buffer_uptodate(bh));
1be62dc1
LT
1186
1187 /*
1188 * Very *carefully* optimize the it-is-already-dirty case.
1189 *
1190 * Don't let the final "is it dirty" escape to before we
1191 * perhaps modified the buffer.
1192 */
1193 if (buffer_dirty(bh)) {
1194 smp_mb();
1195 if (buffer_dirty(bh))
1196 return;
1197 }
1198
1199 if (!test_set_buffer_dirty(bh))
787d2214 1200 __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0);
1da177e4
LT
1201}
1202
1203/*
1204 * Decrement a buffer_head's reference count. If all buffers against a page
1205 * have zero reference count, are clean and unlocked, and if the page is clean
1206 * and unlocked then try_to_free_buffers() may strip the buffers from the page
1207 * in preparation for freeing it (sometimes, rarely, buffers are removed from
1208 * a page but it ends up not being freed, and buffers may later be reattached).
1209 */
1210void __brelse(struct buffer_head * buf)
1211{
1212 if (atomic_read(&buf->b_count)) {
1213 put_bh(buf);
1214 return;
1215 }
5c752ad9 1216 WARN(1, KERN_ERR "VFS: brelse: Trying to free free buffer\n");
1da177e4
LT
1217}
1218
1219/*
1220 * bforget() is like brelse(), except it discards any
1221 * potentially dirty data.
1222 */
1223void __bforget(struct buffer_head *bh)
1224{
1225 clear_buffer_dirty(bh);
535ee2fb 1226 if (bh->b_assoc_map) {
1da177e4
LT
1227 struct address_space *buffer_mapping = bh->b_page->mapping;
1228
1229 spin_lock(&buffer_mapping->private_lock);
1230 list_del_init(&bh->b_assoc_buffers);
58ff407b 1231 bh->b_assoc_map = NULL;
1da177e4
LT
1232 spin_unlock(&buffer_mapping->private_lock);
1233 }
1234 __brelse(bh);
1235}
1236
1237static struct buffer_head *__bread_slow(struct buffer_head *bh)
1238{
1239 lock_buffer(bh);
1240 if (buffer_uptodate(bh)) {
1241 unlock_buffer(bh);
1242 return bh;
1243 } else {
1244 get_bh(bh);
1245 bh->b_end_io = end_buffer_read_sync;
1246 submit_bh(READ, bh);
1247 wait_on_buffer(bh);
1248 if (buffer_uptodate(bh))
1249 return bh;
1250 }
1251 brelse(bh);
1252 return NULL;
1253}
1254
1255/*
1256 * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block().
1257 * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their
1258 * refcount elevated by one when they're in an LRU. A buffer can only appear
1259 * once in a particular CPU's LRU. A single buffer can be present in multiple
1260 * CPU's LRUs at the same time.
1261 *
1262 * This is a transparent caching front-end to sb_bread(), sb_getblk() and
1263 * sb_find_get_block().
1264 *
1265 * The LRUs themselves only need locking against invalidate_bh_lrus. We use
1266 * a local interrupt disable for that.
1267 */
1268
1269#define BH_LRU_SIZE 8
1270
1271struct bh_lru {
1272 struct buffer_head *bhs[BH_LRU_SIZE];
1273};
1274
1275static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
1276
1277#ifdef CONFIG_SMP
1278#define bh_lru_lock() local_irq_disable()
1279#define bh_lru_unlock() local_irq_enable()
1280#else
1281#define bh_lru_lock() preempt_disable()
1282#define bh_lru_unlock() preempt_enable()
1283#endif
1284
/*
 * Assert that interrupts are enabled.  Compiles to nothing when
 * irqs_disabled is not available as a macro.
 */
static inline void check_irqs_on(void)
{
#if defined(irqs_disabled)
	BUG_ON(irqs_disabled());
#endif
}
1291
1292/*
1293 * The LRU management algorithm is dopey-but-simple. Sorry.
1294 */
1295static void bh_lru_install(struct buffer_head *bh)
1296{
1297 struct buffer_head *evictee = NULL;
1298 struct bh_lru *lru;
1299
1300 check_irqs_on();
1301 bh_lru_lock();
1302 lru = &__get_cpu_var(bh_lrus);
1303 if (lru->bhs[0] != bh) {
1304 struct buffer_head *bhs[BH_LRU_SIZE];
1305 int in;
1306 int out = 0;
1307
1308 get_bh(bh);
1309 bhs[out++] = bh;
1310 for (in = 0; in < BH_LRU_SIZE; in++) {
1311 struct buffer_head *bh2 = lru->bhs[in];
1312
1313 if (bh2 == bh) {
1314 __brelse(bh2);
1315 } else {
1316 if (out >= BH_LRU_SIZE) {
1317 BUG_ON(evictee != NULL);
1318 evictee = bh2;
1319 } else {
1320 bhs[out++] = bh2;
1321 }
1322 }
1323 }
1324 while (out < BH_LRU_SIZE)
1325 bhs[out++] = NULL;
1326 memcpy(lru->bhs, bhs, sizeof(bhs));
1327 }
1328 bh_lru_unlock();
1329
1330 if (evictee)
1331 __brelse(evictee);
1332}
1333
1334/*
1335 * Look up the bh in this cpu's LRU. If it's there, move it to the head.
1336 */
858119e1 1337static struct buffer_head *
3991d3bd 1338lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size)
1da177e4
LT
1339{
1340 struct buffer_head *ret = NULL;
1341 struct bh_lru *lru;
3991d3bd 1342 unsigned int i;
1da177e4
LT
1343
1344 check_irqs_on();
1345 bh_lru_lock();
1346 lru = &__get_cpu_var(bh_lrus);
1347 for (i = 0; i < BH_LRU_SIZE; i++) {
1348 struct buffer_head *bh = lru->bhs[i];
1349
1350 if (bh && bh->b_bdev == bdev &&
1351 bh->b_blocknr == block && bh->b_size == size) {
1352 if (i) {
1353 while (i) {
1354 lru->bhs[i] = lru->bhs[i - 1];
1355 i--;
1356 }
1357 lru->bhs[0] = bh;
1358 }
1359 get_bh(bh);
1360 ret = bh;
1361 break;
1362 }
1363 }
1364 bh_lru_unlock();
1365 return ret;
1366}
1367
1368/*
1369 * Perform a pagecache lookup for the matching buffer. If it's there, refresh
1370 * it in the LRU and mark it as accessed. If it is not present then return
1371 * NULL
1372 */
1373struct buffer_head *
3991d3bd 1374__find_get_block(struct block_device *bdev, sector_t block, unsigned size)
1da177e4
LT
1375{
1376 struct buffer_head *bh = lookup_bh_lru(bdev, block, size);
1377
1378 if (bh == NULL) {
385fd4c5 1379 bh = __find_get_block_slow(bdev, block);
1da177e4
LT
1380 if (bh)
1381 bh_lru_install(bh);
1382 }
1383 if (bh)
1384 touch_buffer(bh);
1385 return bh;
1386}
1387EXPORT_SYMBOL(__find_get_block);
1388
1389/*
1390 * __getblk will locate (and, if necessary, create) the buffer_head
1391 * which corresponds to the passed block_device, block and size. The
1392 * returned buffer has its reference count incremented.
1393 *
1394 * __getblk() cannot fail - it just keeps trying. If you pass it an
1395 * illegal block number, __getblk() will happily return a buffer_head
1396 * which represents the non-existent block. Very weird.
1397 *
1398 * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers()
1399 * attempt is failing. FIXME, perhaps?
1400 */
1401struct buffer_head *
3991d3bd 1402__getblk(struct block_device *bdev, sector_t block, unsigned size)
1da177e4
LT
1403{
1404 struct buffer_head *bh = __find_get_block(bdev, block, size);
1405
1406 might_sleep();
1407 if (bh == NULL)
1408 bh = __getblk_slow(bdev, block, size);
1409 return bh;
1410}
1411EXPORT_SYMBOL(__getblk);
1412
1413/*
1414 * Do async read-ahead on a buffer..
1415 */
3991d3bd 1416void __breadahead(struct block_device *bdev, sector_t block, unsigned size)
1da177e4
LT
1417{
1418 struct buffer_head *bh = __getblk(bdev, block, size);
a3e713b5
AM
1419 if (likely(bh)) {
1420 ll_rw_block(READA, 1, &bh);
1421 brelse(bh);
1422 }
1da177e4
LT
1423}
1424EXPORT_SYMBOL(__breadahead);
1425
1426/**
1427 * __bread() - reads a specified block and returns the bh
67be2dd1 1428 * @bdev: the block_device to read from
1da177e4
LT
1429 * @block: number of block
1430 * @size: size (in bytes) to read
1431 *
1432 * Reads a specified block, and returns buffer head that contains it.
1433 * It returns NULL if the block was unreadable.
1434 */
1435struct buffer_head *
3991d3bd 1436__bread(struct block_device *bdev, sector_t block, unsigned size)
1da177e4
LT
1437{
1438 struct buffer_head *bh = __getblk(bdev, block, size);
1439
a3e713b5 1440 if (likely(bh) && !buffer_uptodate(bh))
1da177e4
LT
1441 bh = __bread_slow(bh);
1442 return bh;
1443}
1444EXPORT_SYMBOL(__bread);
1445
1446/*
1447 * invalidate_bh_lrus() is called rarely - but not only at unmount.
1448 * This doesn't race because it runs in each cpu either in irq
1449 * or with preempt disabled.
1450 */
1451static void invalidate_bh_lru(void *arg)
1452{
1453 struct bh_lru *b = &get_cpu_var(bh_lrus);
1454 int i;
1455
1456 for (i = 0; i < BH_LRU_SIZE; i++) {
1457 brelse(b->bhs[i]);
1458 b->bhs[i] = NULL;
1459 }
1460 put_cpu_var(bh_lrus);
1461}
1462
f9a14399 1463void invalidate_bh_lrus(void)
1da177e4 1464{
15c8b6c1 1465 on_each_cpu(invalidate_bh_lru, NULL, 1);
1da177e4 1466}
9db5579b 1467EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
1da177e4
LT
1468
1469void set_bh_page(struct buffer_head *bh,
1470 struct page *page, unsigned long offset)
1471{
1472 bh->b_page = page;
e827f923 1473 BUG_ON(offset >= PAGE_SIZE);
1da177e4
LT
1474 if (PageHighMem(page))
1475 /*
1476 * This catches illegal uses and preserves the offset:
1477 */
1478 bh->b_data = (char *)(0 + offset);
1479 else
1480 bh->b_data = page_address(page) + offset;
1481}
1482EXPORT_SYMBOL(set_bh_page);
1483
1484/*
1485 * Called when truncating a buffer on a page completely.
1486 */
858119e1 1487static void discard_buffer(struct buffer_head * bh)
1da177e4
LT
1488{
1489 lock_buffer(bh);
1490 clear_buffer_dirty(bh);
1491 bh->b_bdev = NULL;
1492 clear_buffer_mapped(bh);
1493 clear_buffer_req(bh);
1494 clear_buffer_new(bh);
1495 clear_buffer_delay(bh);
33a266dd 1496 clear_buffer_unwritten(bh);
1da177e4
LT
1497 unlock_buffer(bh);
1498}
1499
1da177e4
LT
1500/**
1501 * block_invalidatepage - invalidate part of all of a buffer-backed page
1502 *
1503 * @page: the page which is affected
1504 * @offset: the index of the truncation point
1505 *
1506 * block_invalidatepage() is called when all or part of the page has become
1507 * invalidatedby a truncate operation.
1508 *
1509 * block_invalidatepage() does not have to release all buffers, but it must
1510 * ensure that no dirty buffer is left outside @offset and that no I/O
1511 * is underway against any of the blocks which are outside the truncation
1512 * point. Because the caller is about to free (and possibly reuse) those
1513 * blocks on-disk.
1514 */
2ff28e22 1515void block_invalidatepage(struct page *page, unsigned long offset)
1da177e4
LT
1516{
1517 struct buffer_head *head, *bh, *next;
1518 unsigned int curr_off = 0;
1da177e4
LT
1519
1520 BUG_ON(!PageLocked(page));
1521 if (!page_has_buffers(page))
1522 goto out;
1523
1524 head = page_buffers(page);
1525 bh = head;
1526 do {
1527 unsigned int next_off = curr_off + bh->b_size;
1528 next = bh->b_this_page;
1529
1530 /*
1531 * is this block fully invalidated?
1532 */
1533 if (offset <= curr_off)
1534 discard_buffer(bh);
1535 curr_off = next_off;
1536 bh = next;
1537 } while (bh != head);
1538
1539 /*
1540 * We release buffers only if the entire page is being invalidated.
1541 * The get_block cached value has been unconditionally invalidated,
1542 * so real IO is not possible anymore.
1543 */
1544 if (offset == 0)
2ff28e22 1545 try_to_release_page(page, 0);
1da177e4 1546out:
2ff28e22 1547 return;
1da177e4
LT
1548}
1549EXPORT_SYMBOL(block_invalidatepage);
1550
1551/*
1552 * We attach and possibly dirty the buffers atomically wrt
1553 * __set_page_dirty_buffers() via private_lock. try_to_free_buffers
1554 * is already excluded via the page lock.
1555 */
1556void create_empty_buffers(struct page *page,
1557 unsigned long blocksize, unsigned long b_state)
1558{
1559 struct buffer_head *bh, *head, *tail;
1560
1561 head = alloc_page_buffers(page, blocksize, 1);
1562 bh = head;
1563 do {
1564 bh->b_state |= b_state;
1565 tail = bh;
1566 bh = bh->b_this_page;
1567 } while (bh);
1568 tail->b_this_page = head;
1569
1570 spin_lock(&page->mapping->private_lock);
1571 if (PageUptodate(page) || PageDirty(page)) {
1572 bh = head;
1573 do {
1574 if (PageDirty(page))
1575 set_buffer_dirty(bh);
1576 if (PageUptodate(page))
1577 set_buffer_uptodate(bh);
1578 bh = bh->b_this_page;
1579 } while (bh != head);
1580 }
1581 attach_page_buffers(page, head);
1582 spin_unlock(&page->mapping->private_lock);
1583}
1584EXPORT_SYMBOL(create_empty_buffers);
1585
1586/*
1587 * We are taking a block for data and we don't want any output from any
1588 * buffer-cache aliases starting from return from that function and
1589 * until the moment when something will explicitly mark the buffer
1590 * dirty (hopefully that will not happen until we will free that block ;-)
1591 * We don't even need to mark it not-uptodate - nobody can expect
1592 * anything from a newly allocated buffer anyway. We used to used
1593 * unmap_buffer() for such invalidation, but that was wrong. We definitely
1594 * don't want to mark the alias unmapped, for example - it would confuse
1595 * anyone who might pick it with bread() afterwards...
1596 *
1597 * Also.. Note that bforget() doesn't lock the buffer. So there can
1598 * be writeout I/O going on against recently-freed buffers. We don't
1599 * wait on that I/O in bforget() - it's more efficient to wait on the I/O
1600 * only if we really need to. That happens here.
1601 */
1602void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
1603{
1604 struct buffer_head *old_bh;
1605
1606 might_sleep();
1607
385fd4c5 1608 old_bh = __find_get_block_slow(bdev, block);
1da177e4
LT
1609 if (old_bh) {
1610 clear_buffer_dirty(old_bh);
1611 wait_on_buffer(old_bh);
1612 clear_buffer_req(old_bh);
1613 __brelse(old_bh);
1614 }
1615}
1616EXPORT_SYMBOL(unmap_underlying_metadata);
1617
1618/*
1619 * NOTE! All mapped/uptodate combinations are valid:
1620 *
1621 * Mapped Uptodate Meaning
1622 *
1623 * No No "unknown" - must do get_block()
1624 * No Yes "hole" - zero-filled
1625 * Yes No "allocated" - allocated on disk, not read in
1626 * Yes Yes "valid" - allocated and up-to-date in memory.
1627 *
1628 * "Dirty" is valid only with the last case (mapped+uptodate).
1629 */
1630
1631/*
1632 * While block_write_full_page is writing back the dirty buffers under
1633 * the page lock, whoever dirtied the buffers may decide to clean them
1634 * again at any time. We handle that by only looking at the buffer
1635 * state inside lock_buffer().
1636 *
1637 * If block_write_full_page() is called for regular writeback
1638 * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1639 * locked buffer. This only can happen if someone has written the buffer
1640 * directly, with submit_bh(). At the address_space level PageWriteback
1641 * prevents this contention from occurring.
1642 */
1643static int __block_write_full_page(struct inode *inode, struct page *page,
1644 get_block_t *get_block, struct writeback_control *wbc)
1645{
1646 int err;
1647 sector_t block;
1648 sector_t last_block;
f0fbd5fc 1649 struct buffer_head *bh, *head;
b0cf2321 1650 const unsigned blocksize = 1 << inode->i_blkbits;
1da177e4
LT
1651 int nr_underway = 0;
1652
1653 BUG_ON(!PageLocked(page));
1654
1655 last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1656
1657 if (!page_has_buffers(page)) {
b0cf2321 1658 create_empty_buffers(page, blocksize,
1da177e4
LT
1659 (1 << BH_Dirty)|(1 << BH_Uptodate));
1660 }
1661
1662 /*
1663 * Be very careful. We have no exclusion from __set_page_dirty_buffers
1664 * here, and the (potentially unmapped) buffers may become dirty at
1665 * any time. If a buffer becomes dirty here after we've inspected it
1666 * then we just miss that fact, and the page stays dirty.
1667 *
1668 * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1669 * handle that here by just cleaning them.
1670 */
1671
54b21a79 1672 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1da177e4
LT
1673 head = page_buffers(page);
1674 bh = head;
1675
1676 /*
1677 * Get all the dirty buffers mapped to disk addresses and
1678 * handle any aliases from the underlying blockdev's mapping.
1679 */
1680 do {
1681 if (block > last_block) {
1682 /*
1683 * mapped buffers outside i_size will occur, because
1684 * this page can be outside i_size when there is a
1685 * truncate in progress.
1686 */
1687 /*
1688 * The buffer was zeroed by block_write_full_page()
1689 */
1690 clear_buffer_dirty(bh);
1691 set_buffer_uptodate(bh);
29a814d2
AT
1692 } else if ((!buffer_mapped(bh) || buffer_delay(bh)) &&
1693 buffer_dirty(bh)) {
b0cf2321 1694 WARN_ON(bh->b_size != blocksize);
1da177e4
LT
1695 err = get_block(inode, block, bh, 1);
1696 if (err)
1697 goto recover;
29a814d2 1698 clear_buffer_delay(bh);
1da177e4
LT
1699 if (buffer_new(bh)) {
1700 /* blockdev mappings never come here */
1701 clear_buffer_new(bh);
1702 unmap_underlying_metadata(bh->b_bdev,
1703 bh->b_blocknr);
1704 }
1705 }
1706 bh = bh->b_this_page;
1707 block++;
1708 } while (bh != head);
1709
1710 do {
1da177e4
LT
1711 if (!buffer_mapped(bh))
1712 continue;
1713 /*
1714 * If it's a fully non-blocking write attempt and we cannot
1715 * lock the buffer then redirty the page. Note that this can
1716 * potentially cause a busy-wait loop from pdflush and kswapd
1717 * activity, but those code paths have their own higher-level
1718 * throttling.
1719 */
1720 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1721 lock_buffer(bh);
ca5de404 1722 } else if (!trylock_buffer(bh)) {
1da177e4
LT
1723 redirty_page_for_writepage(wbc, page);
1724 continue;
1725 }
1726 if (test_clear_buffer_dirty(bh)) {
1727 mark_buffer_async_write(bh);
1728 } else {
1729 unlock_buffer(bh);
1730 }
1731 } while ((bh = bh->b_this_page) != head);
1732
1733 /*
1734 * The page and its buffers are protected by PageWriteback(), so we can
1735 * drop the bh refcounts early.
1736 */
1737 BUG_ON(PageWriteback(page));
1738 set_page_writeback(page);
1da177e4
LT
1739
1740 do {
1741 struct buffer_head *next = bh->b_this_page;
1742 if (buffer_async_write(bh)) {
1743 submit_bh(WRITE, bh);
1744 nr_underway++;
1745 }
1da177e4
LT
1746 bh = next;
1747 } while (bh != head);
05937baa 1748 unlock_page(page);
1da177e4
LT
1749
1750 err = 0;
1751done:
1752 if (nr_underway == 0) {
1753 /*
1754 * The page was marked dirty, but the buffers were
1755 * clean. Someone wrote them back by hand with
1756 * ll_rw_block/submit_bh. A rare case.
1757 */
1da177e4 1758 end_page_writeback(page);
3d67f2d7 1759
1da177e4
LT
1760 /*
1761 * The page and buffer_heads can be released at any time from
1762 * here on.
1763 */
1da177e4
LT
1764 }
1765 return err;
1766
1767recover:
1768 /*
1769 * ENOSPC, or some other error. We may already have added some
1770 * blocks to the file, so we need to write these out to avoid
1771 * exposing stale data.
1772 * The page is currently locked and not marked for writeback
1773 */
1774 bh = head;
1775 /* Recovery: lock and submit the mapped buffers */
1776 do {
29a814d2
AT
1777 if (buffer_mapped(bh) && buffer_dirty(bh) &&
1778 !buffer_delay(bh)) {
1da177e4
LT
1779 lock_buffer(bh);
1780 mark_buffer_async_write(bh);
1781 } else {
1782 /*
1783 * The buffer may have been set dirty during
1784 * attachment to a dirty page.
1785 */
1786 clear_buffer_dirty(bh);
1787 }
1788 } while ((bh = bh->b_this_page) != head);
1789 SetPageError(page);
1790 BUG_ON(PageWriteback(page));
7e4c3690 1791 mapping_set_error(page->mapping, err);
1da177e4 1792 set_page_writeback(page);
1da177e4
LT
1793 do {
1794 struct buffer_head *next = bh->b_this_page;
1795 if (buffer_async_write(bh)) {
1796 clear_buffer_dirty(bh);
1797 submit_bh(WRITE, bh);
1798 nr_underway++;
1799 }
1da177e4
LT
1800 bh = next;
1801 } while (bh != head);
ffda9d30 1802 unlock_page(page);
1da177e4
LT
1803 goto done;
1804}
1805
afddba49
NP
1806/*
1807 * If a page has any new buffers, zero them out here, and mark them uptodate
1808 * and dirty so they'll be written out (in order to prevent uninitialised
1809 * block data from leaking). And clear the new bit.
1810 */
1811void page_zero_new_buffers(struct page *page, unsigned from, unsigned to)
1812{
1813 unsigned int block_start, block_end;
1814 struct buffer_head *head, *bh;
1815
1816 BUG_ON(!PageLocked(page));
1817 if (!page_has_buffers(page))
1818 return;
1819
1820 bh = head = page_buffers(page);
1821 block_start = 0;
1822 do {
1823 block_end = block_start + bh->b_size;
1824
1825 if (buffer_new(bh)) {
1826 if (block_end > from && block_start < to) {
1827 if (!PageUptodate(page)) {
1828 unsigned start, size;
1829
1830 start = max(from, block_start);
1831 size = min(to, block_end) - start;
1832
eebd2aa3 1833 zero_user(page, start, size);
afddba49
NP
1834 set_buffer_uptodate(bh);
1835 }
1836
1837 clear_buffer_new(bh);
1838 mark_buffer_dirty(bh);
1839 }
1840 }
1841
1842 block_start = block_end;
1843 bh = bh->b_this_page;
1844 } while (bh != head);
1845}
1846EXPORT_SYMBOL(page_zero_new_buffers);
1847
1da177e4
LT
1848static int __block_prepare_write(struct inode *inode, struct page *page,
1849 unsigned from, unsigned to, get_block_t *get_block)
1850{
1851 unsigned block_start, block_end;
1852 sector_t block;
1853 int err = 0;
1854 unsigned blocksize, bbits;
1855 struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;
1856
1857 BUG_ON(!PageLocked(page));
1858 BUG_ON(from > PAGE_CACHE_SIZE);
1859 BUG_ON(to > PAGE_CACHE_SIZE);
1860 BUG_ON(from > to);
1861
1862 blocksize = 1 << inode->i_blkbits;
1863 if (!page_has_buffers(page))
1864 create_empty_buffers(page, blocksize, 0);
1865 head = page_buffers(page);
1866
1867 bbits = inode->i_blkbits;
1868 block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
1869
1870 for(bh = head, block_start = 0; bh != head || !block_start;
1871 block++, block_start=block_end, bh = bh->b_this_page) {
1872 block_end = block_start + blocksize;
1873 if (block_end <= from || block_start >= to) {
1874 if (PageUptodate(page)) {
1875 if (!buffer_uptodate(bh))
1876 set_buffer_uptodate(bh);
1877 }
1878 continue;
1879 }
1880 if (buffer_new(bh))
1881 clear_buffer_new(bh);
1882 if (!buffer_mapped(bh)) {
b0cf2321 1883 WARN_ON(bh->b_size != blocksize);
1da177e4
LT
1884 err = get_block(inode, block, bh, 1);
1885 if (err)
f3ddbdc6 1886 break;
1da177e4 1887 if (buffer_new(bh)) {
1da177e4
LT
1888 unmap_underlying_metadata(bh->b_bdev,
1889 bh->b_blocknr);
1890 if (PageUptodate(page)) {
637aff46 1891 clear_buffer_new(bh);
1da177e4 1892 set_buffer_uptodate(bh);
637aff46 1893 mark_buffer_dirty(bh);
1da177e4
LT
1894 continue;
1895 }
eebd2aa3
CL
1896 if (block_end > to || block_start < from)
1897 zero_user_segments(page,
1898 to, block_end,
1899 block_start, from);
1da177e4
LT
1900 continue;
1901 }
1902 }
1903 if (PageUptodate(page)) {
1904 if (!buffer_uptodate(bh))
1905 set_buffer_uptodate(bh);
1906 continue;
1907 }
1908 if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
33a266dd 1909 !buffer_unwritten(bh) &&
1da177e4
LT
1910 (block_start < from || block_end > to)) {
1911 ll_rw_block(READ, 1, &bh);
1912 *wait_bh++=bh;
1913 }
1914 }
1915 /*
1916 * If we issued read requests - let them complete.
1917 */
1918 while(wait_bh > wait) {
1919 wait_on_buffer(*--wait_bh);
1920 if (!buffer_uptodate(*wait_bh))
f3ddbdc6 1921 err = -EIO;
1da177e4 1922 }
afddba49
NP
1923 if (unlikely(err))
1924 page_zero_new_buffers(page, from, to);
1da177e4
LT
1925 return err;
1926}
1927
1928static int __block_commit_write(struct inode *inode, struct page *page,
1929 unsigned from, unsigned to)
1930{
1931 unsigned block_start, block_end;
1932 int partial = 0;
1933 unsigned blocksize;
1934 struct buffer_head *bh, *head;
1935
1936 blocksize = 1 << inode->i_blkbits;
1937
1938 for(bh = head = page_buffers(page), block_start = 0;
1939 bh != head || !block_start;
1940 block_start=block_end, bh = bh->b_this_page) {
1941 block_end = block_start + blocksize;
1942 if (block_end <= from || block_start >= to) {
1943 if (!buffer_uptodate(bh))
1944 partial = 1;
1945 } else {
1946 set_buffer_uptodate(bh);
1947 mark_buffer_dirty(bh);
1948 }
afddba49 1949 clear_buffer_new(bh);
1da177e4
LT
1950 }
1951
1952 /*
1953 * If this is a partial write which happened to make all buffers
1954 * uptodate then we can optimize away a bogus readpage() for
1955 * the next read(). Here we 'discover' whether the page went
1956 * uptodate as a result of this (potentially partial) write.
1957 */
1958 if (!partial)
1959 SetPageUptodate(page);
1960 return 0;
1961}
1962
afddba49
NP
1963/*
1964 * block_write_begin takes care of the basic task of block allocation and
1965 * bringing partial write blocks uptodate first.
1966 *
1967 * If *pagep is not NULL, then block_write_begin uses the locked page
1968 * at *pagep rather than allocating its own. In this case, the page will
1969 * not be unlocked or deallocated on failure.
1970 */
1971int block_write_begin(struct file *file, struct address_space *mapping,
1972 loff_t pos, unsigned len, unsigned flags,
1973 struct page **pagep, void **fsdata,
1974 get_block_t *get_block)
1975{
1976 struct inode *inode = mapping->host;
1977 int status = 0;
1978 struct page *page;
1979 pgoff_t index;
1980 unsigned start, end;
1981 int ownpage = 0;
1982
1983 index = pos >> PAGE_CACHE_SHIFT;
1984 start = pos & (PAGE_CACHE_SIZE - 1);
1985 end = start + len;
1986
1987 page = *pagep;
1988 if (page == NULL) {
1989 ownpage = 1;
1990 page = __grab_cache_page(mapping, index);
1991 if (!page) {
1992 status = -ENOMEM;
1993 goto out;
1994 }
1995 *pagep = page;
1996 } else
1997 BUG_ON(!PageLocked(page));
1998
1999 status = __block_prepare_write(inode, page, start, end, get_block);
2000 if (unlikely(status)) {
2001 ClearPageUptodate(page);
2002
2003 if (ownpage) {
2004 unlock_page(page);
2005 page_cache_release(page);
2006 *pagep = NULL;
2007
2008 /*
2009 * prepare_write() may have instantiated a few blocks
2010 * outside i_size. Trim these off again. Don't need
2011 * i_size_read because we hold i_mutex.
2012 */
2013 if (pos + len > inode->i_size)
2014 vmtruncate(inode, inode->i_size);
2015 }
2016 goto out;
2017 }
2018
2019out:
2020 return status;
2021}
2022EXPORT_SYMBOL(block_write_begin);
2023
2024int block_write_end(struct file *file, struct address_space *mapping,
2025 loff_t pos, unsigned len, unsigned copied,
2026 struct page *page, void *fsdata)
2027{
2028 struct inode *inode = mapping->host;
2029 unsigned start;
2030
2031 start = pos & (PAGE_CACHE_SIZE - 1);
2032
2033 if (unlikely(copied < len)) {
2034 /*
2035 * The buffers that were written will now be uptodate, so we
2036 * don't have to worry about a readpage reading them and
2037 * overwriting a partial write. However if we have encountered
2038 * a short write and only partially written into a buffer, it
2039 * will not be marked uptodate, so a readpage might come in and
2040 * destroy our partial write.
2041 *
2042 * Do the simplest thing, and just treat any short write to a
2043 * non uptodate page as a zero-length write, and force the
2044 * caller to redo the whole thing.
2045 */
2046 if (!PageUptodate(page))
2047 copied = 0;
2048
2049 page_zero_new_buffers(page, start+copied, start+len);
2050 }
2051 flush_dcache_page(page);
2052
2053 /* This could be a short (even 0-length) commit */
2054 __block_commit_write(inode, page, start, start+copied);
2055
2056 return copied;
2057}
2058EXPORT_SYMBOL(block_write_end);
2059
2060int generic_write_end(struct file *file, struct address_space *mapping,
2061 loff_t pos, unsigned len, unsigned copied,
2062 struct page *page, void *fsdata)
2063{
2064 struct inode *inode = mapping->host;
c7d206b3 2065 int i_size_changed = 0;
afddba49
NP
2066
2067 copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
2068
2069 /*
2070 * No need to use i_size_read() here, the i_size
2071 * cannot change under us because we hold i_mutex.
2072 *
2073 * But it's important to update i_size while still holding page lock:
2074 * page writeout could otherwise come in and zero beyond i_size.
2075 */
2076 if (pos+copied > inode->i_size) {
2077 i_size_write(inode, pos+copied);
c7d206b3 2078 i_size_changed = 1;
afddba49
NP
2079 }
2080
2081 unlock_page(page);
2082 page_cache_release(page);
2083
c7d206b3
JK
2084 /*
2085 * Don't mark the inode dirty under page lock. First, it unnecessarily
2086 * makes the holding time of page lock longer. Second, it forces lock
2087 * ordering of page lock and transaction start for journaling
2088 * filesystems.
2089 */
2090 if (i_size_changed)
2091 mark_inode_dirty(inode);
2092
afddba49
NP
2093 return copied;
2094}
2095EXPORT_SYMBOL(generic_write_end);
2096
8ab22b9a
HH
2097/*
2098 * block_is_partially_uptodate checks whether buffers within a page are
2099 * uptodate or not.
2100 *
2101 * Returns true if all buffers which correspond to a file portion
2102 * we want to read are uptodate.
2103 */
2104int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
2105 unsigned long from)
2106{
2107 struct inode *inode = page->mapping->host;
2108 unsigned block_start, block_end, blocksize;
2109 unsigned to;
2110 struct buffer_head *bh, *head;
2111 int ret = 1;
2112
2113 if (!page_has_buffers(page))
2114 return 0;
2115
2116 blocksize = 1 << inode->i_blkbits;
2117 to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
2118 to = from + to;
2119 if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
2120 return 0;
2121
2122 head = page_buffers(page);
2123 bh = head;
2124 block_start = 0;
2125 do {
2126 block_end = block_start + blocksize;
2127 if (block_end > from && block_start < to) {
2128 if (!buffer_uptodate(bh)) {
2129 ret = 0;
2130 break;
2131 }
2132 if (block_end >= to)
2133 break;
2134 }
2135 block_start = block_end;
2136 bh = bh->b_this_page;
2137 } while (bh != head);
2138
2139 return ret;
2140}
2141EXPORT_SYMBOL(block_is_partially_uptodate);
2142
1da177e4
LT
2143/*
2144 * Generic "read page" function for block devices that have the normal
2145 * get_block functionality. This is most of the block device filesystems.
2146 * Reads the page asynchronously --- the unlock_buffer() and
2147 * set/clear_buffer_uptodate() functions propagate buffer state into the
2148 * page struct once IO has completed.
2149 */
2150int block_read_full_page(struct page *page, get_block_t *get_block)
2151{
2152 struct inode *inode = page->mapping->host;
2153 sector_t iblock, lblock;
2154 struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
2155 unsigned int blocksize;
2156 int nr, i;
2157 int fully_mapped = 1;
2158
cd7619d6 2159 BUG_ON(!PageLocked(page));
1da177e4
LT
2160 blocksize = 1 << inode->i_blkbits;
2161 if (!page_has_buffers(page))
2162 create_empty_buffers(page, blocksize, 0);
2163 head = page_buffers(page);
2164
2165 iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2166 lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
2167 bh = head;
2168 nr = 0;
2169 i = 0;
2170
2171 do {
2172 if (buffer_uptodate(bh))
2173 continue;
2174
2175 if (!buffer_mapped(bh)) {
c64610ba
AM
2176 int err = 0;
2177
1da177e4
LT
2178 fully_mapped = 0;
2179 if (iblock < lblock) {
b0cf2321 2180 WARN_ON(bh->b_size != blocksize);
c64610ba
AM
2181 err = get_block(inode, iblock, bh, 0);
2182 if (err)
1da177e4
LT
2183 SetPageError(page);
2184 }
2185 if (!buffer_mapped(bh)) {
eebd2aa3 2186 zero_user(page, i * blocksize, blocksize);
c64610ba
AM
2187 if (!err)
2188 set_buffer_uptodate(bh);
1da177e4
LT
2189 continue;
2190 }
2191 /*
2192 * get_block() might have updated the buffer
2193 * synchronously
2194 */
2195 if (buffer_uptodate(bh))
2196 continue;
2197 }
2198 arr[nr++] = bh;
2199 } while (i++, iblock++, (bh = bh->b_this_page) != head);
2200
2201 if (fully_mapped)
2202 SetPageMappedToDisk(page);
2203
2204 if (!nr) {
2205 /*
2206 * All buffers are uptodate - we can set the page uptodate
2207 * as well. But not if get_block() returned an error.
2208 */
2209 if (!PageError(page))
2210 SetPageUptodate(page);
2211 unlock_page(page);
2212 return 0;
2213 }
2214
2215 /* Stage two: lock the buffers */
2216 for (i = 0; i < nr; i++) {
2217 bh = arr[i];
2218 lock_buffer(bh);
2219 mark_buffer_async_read(bh);
2220 }
2221
2222 /*
2223 * Stage 3: start the IO. Check for uptodateness
2224 * inside the buffer lock in case another process reading
2225 * the underlying blockdev brought it uptodate (the sct fix).
2226 */
2227 for (i = 0; i < nr; i++) {
2228 bh = arr[i];
2229 if (buffer_uptodate(bh))
2230 end_buffer_async_read(bh, 1);
2231 else
2232 submit_bh(READ, bh);
2233 }
2234 return 0;
2235}
2236
2237/* utility function for filesystems that need to do work on expanding
89e10787 2238 * truncates. Uses filesystem pagecache writes to allow the filesystem to
1da177e4
LT
2239 * deal with the hole.
2240 */
89e10787 2241int generic_cont_expand_simple(struct inode *inode, loff_t size)
1da177e4
LT
2242{
2243 struct address_space *mapping = inode->i_mapping;
2244 struct page *page;
89e10787 2245 void *fsdata;
05eb0b51 2246 unsigned long limit;
1da177e4
LT
2247 int err;
2248
2249 err = -EFBIG;
2250 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur;
2251 if (limit != RLIM_INFINITY && size > (loff_t)limit) {
2252 send_sig(SIGXFSZ, current, 0);
2253 goto out;
2254 }
2255 if (size > inode->i_sb->s_maxbytes)
2256 goto out;
2257
89e10787
NP
2258 err = pagecache_write_begin(NULL, mapping, size, 0,
2259 AOP_FLAG_UNINTERRUPTIBLE|AOP_FLAG_CONT_EXPAND,
2260 &page, &fsdata);
2261 if (err)
05eb0b51 2262 goto out;
05eb0b51 2263
89e10787
NP
2264 err = pagecache_write_end(NULL, mapping, size, 0, 0, page, fsdata);
2265 BUG_ON(err > 0);
05eb0b51 2266
1da177e4
LT
2267out:
2268 return err;
2269}
2270
f1e3af72
AB
2271static int cont_expand_zero(struct file *file, struct address_space *mapping,
2272 loff_t pos, loff_t *bytes)
1da177e4 2273{
1da177e4 2274 struct inode *inode = mapping->host;
1da177e4 2275 unsigned blocksize = 1 << inode->i_blkbits;
89e10787
NP
2276 struct page *page;
2277 void *fsdata;
2278 pgoff_t index, curidx;
2279 loff_t curpos;
2280 unsigned zerofrom, offset, len;
2281 int err = 0;
1da177e4 2282
89e10787
NP
2283 index = pos >> PAGE_CACHE_SHIFT;
2284 offset = pos & ~PAGE_CACHE_MASK;
2285
2286 while (index > (curidx = (curpos = *bytes)>>PAGE_CACHE_SHIFT)) {
2287 zerofrom = curpos & ~PAGE_CACHE_MASK;
1da177e4
LT
2288 if (zerofrom & (blocksize-1)) {
2289 *bytes |= (blocksize-1);
2290 (*bytes)++;
2291 }
89e10787 2292 len = PAGE_CACHE_SIZE - zerofrom;
1da177e4 2293
89e10787
NP
2294 err = pagecache_write_begin(file, mapping, curpos, len,
2295 AOP_FLAG_UNINTERRUPTIBLE,
2296 &page, &fsdata);
2297 if (err)
2298 goto out;
eebd2aa3 2299 zero_user(page, zerofrom, len);
89e10787
NP
2300 err = pagecache_write_end(file, mapping, curpos, len, len,
2301 page, fsdata);
2302 if (err < 0)
2303 goto out;
2304 BUG_ON(err != len);
2305 err = 0;
061e9746
OH
2306
2307 balance_dirty_pages_ratelimited(mapping);
89e10787 2308 }
1da177e4 2309
89e10787
NP
2310 /* page covers the boundary, find the boundary offset */
2311 if (index == curidx) {
2312 zerofrom = curpos & ~PAGE_CACHE_MASK;
1da177e4 2313 /* if we will expand the thing last block will be filled */
89e10787
NP
2314 if (offset <= zerofrom) {
2315 goto out;
2316 }
2317 if (zerofrom & (blocksize-1)) {
1da177e4
LT
2318 *bytes |= (blocksize-1);
2319 (*bytes)++;
2320 }
89e10787 2321 len = offset - zerofrom;
1da177e4 2322
89e10787
NP
2323 err = pagecache_write_begin(file, mapping, curpos, len,
2324 AOP_FLAG_UNINTERRUPTIBLE,
2325 &page, &fsdata);
2326 if (err)
2327 goto out;
eebd2aa3 2328 zero_user(page, zerofrom, len);
89e10787
NP
2329 err = pagecache_write_end(file, mapping, curpos, len, len,
2330 page, fsdata);
2331 if (err < 0)
2332 goto out;
2333 BUG_ON(err != len);
2334 err = 0;
1da177e4 2335 }
89e10787
NP
2336out:
2337 return err;
2338}
2339
2340/*
2341 * For moronic filesystems that do not allow holes in file.
2342 * We may have to extend the file.
2343 */
2344int cont_write_begin(struct file *file, struct address_space *mapping,
2345 loff_t pos, unsigned len, unsigned flags,
2346 struct page **pagep, void **fsdata,
2347 get_block_t *get_block, loff_t *bytes)
2348{
2349 struct inode *inode = mapping->host;
2350 unsigned blocksize = 1 << inode->i_blkbits;
2351 unsigned zerofrom;
2352 int err;
2353
2354 err = cont_expand_zero(file, mapping, pos, bytes);
2355 if (err)
2356 goto out;
2357
2358 zerofrom = *bytes & ~PAGE_CACHE_MASK;
2359 if (pos+len > *bytes && zerofrom & (blocksize-1)) {
2360 *bytes |= (blocksize-1);
2361 (*bytes)++;
1da177e4 2362 }
1da177e4 2363
89e10787
NP
2364 *pagep = NULL;
2365 err = block_write_begin(file, mapping, pos, len,
2366 flags, pagep, fsdata, get_block);
1da177e4 2367out:
89e10787 2368 return err;
1da177e4
LT
2369}
2370
2371int block_prepare_write(struct page *page, unsigned from, unsigned to,
2372 get_block_t *get_block)
2373{
2374 struct inode *inode = page->mapping->host;
2375 int err = __block_prepare_write(inode, page, from, to, get_block);
2376 if (err)
2377 ClearPageUptodate(page);
2378 return err;
2379}
2380
2381int block_commit_write(struct page *page, unsigned from, unsigned to)
2382{
2383 struct inode *inode = page->mapping->host;
2384 __block_commit_write(inode,page,from,to);
2385 return 0;
2386}
2387
54171690
DC
2388/*
2389 * block_page_mkwrite() is not allowed to change the file size as it gets
2390 * called from a page fault handler when a page is first dirtied. Hence we must
2391 * be careful to check for EOF conditions here. We set the page up correctly
2392 * for a written page which means we get ENOSPC checking when writing into
2393 * holes and correct delalloc and unwritten extent mapping on filesystems that
2394 * support these features.
2395 *
2396 * We are not allowed to take the i_mutex here so we have to play games to
2397 * protect against truncate races as the page could now be beyond EOF. Because
2398 * vmtruncate() writes the inode size before removing pages, once we have the
2399 * page lock we can determine safely if the page is beyond EOF. If it is not
2400 * beyond EOF, then the page is guaranteed safe against truncation until we
2401 * unlock the page.
2402 */
2403int
2404block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
2405 get_block_t get_block)
2406{
2407 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
2408 unsigned long end;
2409 loff_t size;
2410 int ret = -EINVAL;
2411
2412 lock_page(page);
2413 size = i_size_read(inode);
2414 if ((page->mapping != inode->i_mapping) ||
18336338 2415 (page_offset(page) > size)) {
54171690
DC
2416 /* page got truncated out from underneath us */
2417 goto out_unlock;
2418 }
2419
2420 /* page is wholly or partially inside EOF */
2421 if (((page->index + 1) << PAGE_CACHE_SHIFT) > size)
2422 end = size & ~PAGE_CACHE_MASK;
2423 else
2424 end = PAGE_CACHE_SIZE;
2425
2426 ret = block_prepare_write(page, 0, end, get_block);
2427 if (!ret)
2428 ret = block_commit_write(page, 0, end);
2429
2430out_unlock:
2431 unlock_page(page);
2432 return ret;
2433}
1da177e4
LT
2434
/*
 * Completion handler for the prereads issued by nobh_write_begin().
 * Those buffer_heads are freed right away, still under the page lock,
 * so the handler must not touch the bh once it has been unlocked.  All
 * of the work is left to __end_buffer_read_notouch().
 */
static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
}
2444
03158cd7
NP
2445/*
2446 * Attach the singly-linked list of buffers created by nobh_write_begin, to
2447 * the page (converting it to circular linked list and taking care of page
2448 * dirty races).
2449 */
2450static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2451{
2452 struct buffer_head *bh;
2453
2454 BUG_ON(!PageLocked(page));
2455
2456 spin_lock(&page->mapping->private_lock);
2457 bh = head;
2458 do {
2459 if (PageDirty(page))
2460 set_buffer_dirty(bh);
2461 if (!bh->b_this_page)
2462 bh->b_this_page = head;
2463 bh = bh->b_this_page;
2464 } while (bh != head);
2465 attach_page_buffers(page, head);
2466 spin_unlock(&page->mapping->private_lock);
2467}
2468
1da177e4
LT
2469/*
2470 * On entry, the page is fully not uptodate.
2471 * On exit the page is fully uptodate in the areas outside (from,to)
2472 */
03158cd7
NP
2473int nobh_write_begin(struct file *file, struct address_space *mapping,
2474 loff_t pos, unsigned len, unsigned flags,
2475 struct page **pagep, void **fsdata,
1da177e4
LT
2476 get_block_t *get_block)
2477{
03158cd7 2478 struct inode *inode = mapping->host;
1da177e4
LT
2479 const unsigned blkbits = inode->i_blkbits;
2480 const unsigned blocksize = 1 << blkbits;
a4b0672d 2481 struct buffer_head *head, *bh;
03158cd7
NP
2482 struct page *page;
2483 pgoff_t index;
2484 unsigned from, to;
1da177e4 2485 unsigned block_in_page;
a4b0672d 2486 unsigned block_start, block_end;
1da177e4 2487 sector_t block_in_file;
1da177e4 2488 int nr_reads = 0;
1da177e4
LT
2489 int ret = 0;
2490 int is_mapped_to_disk = 1;
1da177e4 2491
03158cd7
NP
2492 index = pos >> PAGE_CACHE_SHIFT;
2493 from = pos & (PAGE_CACHE_SIZE - 1);
2494 to = from + len;
2495
2496 page = __grab_cache_page(mapping, index);
2497 if (!page)
2498 return -ENOMEM;
2499 *pagep = page;
2500 *fsdata = NULL;
2501
2502 if (page_has_buffers(page)) {
2503 unlock_page(page);
2504 page_cache_release(page);
2505 *pagep = NULL;
2506 return block_write_begin(file, mapping, pos, len, flags, pagep,
2507 fsdata, get_block);
2508 }
a4b0672d 2509
1da177e4
LT
2510 if (PageMappedToDisk(page))
2511 return 0;
2512
a4b0672d
NP
2513 /*
2514 * Allocate buffers so that we can keep track of state, and potentially
2515 * attach them to the page if an error occurs. In the common case of
2516 * no error, they will just be freed again without ever being attached
2517 * to the page (which is all OK, because we're under the page lock).
2518 *
2519 * Be careful: the buffer linked list is a NULL terminated one, rather
2520 * than the circular one we're used to.
2521 */
2522 head = alloc_page_buffers(page, blocksize, 0);
03158cd7
NP
2523 if (!head) {
2524 ret = -ENOMEM;
2525 goto out_release;
2526 }
a4b0672d 2527
1da177e4 2528 block_in_file = (sector_t)page->index << (PAGE_CACHE_SHIFT - blkbits);
1da177e4
LT
2529
2530 /*
2531 * We loop across all blocks in the page, whether or not they are
2532 * part of the affected region. This is so we can discover if the
2533 * page is fully mapped-to-disk.
2534 */
a4b0672d 2535 for (block_start = 0, block_in_page = 0, bh = head;
1da177e4 2536 block_start < PAGE_CACHE_SIZE;
a4b0672d 2537 block_in_page++, block_start += blocksize, bh = bh->b_this_page) {
1da177e4
LT
2538 int create;
2539
a4b0672d
NP
2540 block_end = block_start + blocksize;
2541 bh->b_state = 0;
1da177e4
LT
2542 create = 1;
2543 if (block_start >= to)
2544 create = 0;
2545 ret = get_block(inode, block_in_file + block_in_page,
a4b0672d 2546 bh, create);
1da177e4
LT
2547 if (ret)
2548 goto failed;
a4b0672d 2549 if (!buffer_mapped(bh))
1da177e4 2550 is_mapped_to_disk = 0;
a4b0672d
NP
2551 if (buffer_new(bh))
2552 unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
2553 if (PageUptodate(page)) {
2554 set_buffer_uptodate(bh);
1da177e4 2555 continue;
a4b0672d
NP
2556 }
2557 if (buffer_new(bh) || !buffer_mapped(bh)) {
eebd2aa3
CL
2558 zero_user_segments(page, block_start, from,
2559 to, block_end);
1da177e4
LT
2560 continue;
2561 }
a4b0672d 2562 if (buffer_uptodate(bh))
1da177e4
LT
2563 continue; /* reiserfs does this */
2564 if (block_start < from || block_end > to) {
a4b0672d
NP
2565 lock_buffer(bh);
2566 bh->b_end_io = end_buffer_read_nobh;
2567 submit_bh(READ, bh);
2568 nr_reads++;
1da177e4
LT
2569 }
2570 }
2571
2572 if (nr_reads) {
1da177e4
LT
2573 /*
2574 * The page is locked, so these buffers are protected from
2575 * any VM or truncate activity. Hence we don't need to care
2576 * for the buffer_head refcounts.
2577 */
a4b0672d 2578 for (bh = head; bh; bh = bh->b_this_page) {
1da177e4
LT
2579 wait_on_buffer(bh);
2580 if (!buffer_uptodate(bh))
2581 ret = -EIO;
1da177e4
LT
2582 }
2583 if (ret)
2584 goto failed;
2585 }
2586
2587 if (is_mapped_to_disk)
2588 SetPageMappedToDisk(page);
1da177e4 2589
03158cd7 2590 *fsdata = head; /* to be released by nobh_write_end */
a4b0672d 2591
1da177e4
LT
2592 return 0;
2593
2594failed:
03158cd7 2595 BUG_ON(!ret);
1da177e4 2596 /*
a4b0672d
NP
2597 * Error recovery is a bit difficult. We need to zero out blocks that
2598 * were newly allocated, and dirty them to ensure they get written out.
2599 * Buffers need to be attached to the page at this point, otherwise
2600 * the handling of potential IO errors during writeout would be hard
2601 * (could try doing synchronous writeout, but what if that fails too?)
1da177e4 2602 */
03158cd7
NP
2603 attach_nobh_buffers(page, head);
2604 page_zero_new_buffers(page, from, to);
a4b0672d 2605
03158cd7
NP
2606out_release:
2607 unlock_page(page);
2608 page_cache_release(page);
2609 *pagep = NULL;
a4b0672d 2610
03158cd7
NP
2611 if (pos + len > inode->i_size)
2612 vmtruncate(inode, inode->i_size);
a4b0672d 2613
1da177e4
LT
2614 return ret;
2615}
03158cd7 2616EXPORT_SYMBOL(nobh_write_begin);
1da177e4 2617
03158cd7
NP
2618int nobh_write_end(struct file *file, struct address_space *mapping,
2619 loff_t pos, unsigned len, unsigned copied,
2620 struct page *page, void *fsdata)
1da177e4
LT
2621{
2622 struct inode *inode = page->mapping->host;
efdc3131 2623 struct buffer_head *head = fsdata;
03158cd7 2624 struct buffer_head *bh;
5b41e74a 2625 BUG_ON(fsdata != NULL && page_has_buffers(page));
1da177e4 2626
5b41e74a
DM
2627 if (unlikely(copied < len) && !page_has_buffers(page))
2628 attach_nobh_buffers(page, head);
2629 if (page_has_buffers(page))
2630 return generic_write_end(file, mapping, pos, len,
2631 copied, page, fsdata);
a4b0672d 2632
22c8ca78 2633 SetPageUptodate(page);
1da177e4 2634 set_page_dirty(page);
03158cd7
NP
2635 if (pos+copied > inode->i_size) {
2636 i_size_write(inode, pos+copied);
1da177e4
LT
2637 mark_inode_dirty(inode);
2638 }
03158cd7
NP
2639
2640 unlock_page(page);
2641 page_cache_release(page);
2642
03158cd7
NP
2643 while (head) {
2644 bh = head;
2645 head = head->b_this_page;
2646 free_buffer_head(bh);
2647 }
2648
2649 return copied;
1da177e4 2650}
03158cd7 2651EXPORT_SYMBOL(nobh_write_end);
1da177e4
LT
2652
2653/*
2654 * nobh_writepage() - based on block_full_write_page() except
2655 * that it tries to operate without attaching bufferheads to
2656 * the page.
2657 */
2658int nobh_writepage(struct page *page, get_block_t *get_block,
2659 struct writeback_control *wbc)
2660{
2661 struct inode * const inode = page->mapping->host;
2662 loff_t i_size = i_size_read(inode);
2663 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2664 unsigned offset;
1da177e4
LT
2665 int ret;
2666
2667 /* Is the page fully inside i_size? */
2668 if (page->index < end_index)
2669 goto out;
2670
2671 /* Is the page fully outside i_size? (truncate in progress) */
2672 offset = i_size & (PAGE_CACHE_SIZE-1);
2673 if (page->index >= end_index+1 || !offset) {
2674 /*
2675 * The page may have dirty, unmapped buffers. For example,
2676 * they may have been added in ext3_writepage(). Make them
2677 * freeable here, so the page does not leak.
2678 */
2679#if 0
2680 /* Not really sure about this - do we need this ? */
2681 if (page->mapping->a_ops->invalidatepage)
2682 page->mapping->a_ops->invalidatepage(page, offset);
2683#endif
2684 unlock_page(page);
2685 return 0; /* don't care */
2686 }
2687
2688 /*
2689 * The page straddles i_size. It must be zeroed out on each and every
2690 * writepage invocation because it may be mmapped. "A file is mapped
2691 * in multiples of the page size. For a file that is not a multiple of
2692 * the page size, the remaining memory is zeroed when mapped, and
2693 * writes to that region are not written out to the file."
2694 */
eebd2aa3 2695 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
1da177e4
LT
2696out:
2697 ret = mpage_writepage(page, get_block, wbc);
2698 if (ret == -EAGAIN)
2699 ret = __block_write_full_page(inode, page, get_block, wbc);
2700 return ret;
2701}
2702EXPORT_SYMBOL(nobh_writepage);
2703
03158cd7
NP
2704int nobh_truncate_page(struct address_space *mapping,
2705 loff_t from, get_block_t *get_block)
1da177e4 2706{
1da177e4
LT
2707 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2708 unsigned offset = from & (PAGE_CACHE_SIZE-1);
03158cd7
NP
2709 unsigned blocksize;
2710 sector_t iblock;
2711 unsigned length, pos;
2712 struct inode *inode = mapping->host;
1da177e4 2713 struct page *page;
03158cd7
NP
2714 struct buffer_head map_bh;
2715 int err;
1da177e4 2716
03158cd7
NP
2717 blocksize = 1 << inode->i_blkbits;
2718 length = offset & (blocksize - 1);
2719
2720 /* Block boundary? Nothing to do */
2721 if (!length)
2722 return 0;
2723
2724 length = blocksize - length;
2725 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1da177e4 2726
1da177e4 2727 page = grab_cache_page(mapping, index);
03158cd7 2728 err = -ENOMEM;
1da177e4
LT
2729 if (!page)
2730 goto out;
2731
03158cd7
NP
2732 if (page_has_buffers(page)) {
2733has_buffers:
2734 unlock_page(page);
2735 page_cache_release(page);
2736 return block_truncate_page(mapping, from, get_block);
2737 }
2738
2739 /* Find the buffer that contains "offset" */
2740 pos = blocksize;
2741 while (offset >= pos) {
2742 iblock++;
2743 pos += blocksize;
2744 }
2745
2746 err = get_block(inode, iblock, &map_bh, 0);
2747 if (err)
2748 goto unlock;
2749 /* unmapped? It's a hole - nothing to do */
2750 if (!buffer_mapped(&map_bh))
2751 goto unlock;
2752
2753 /* Ok, it's mapped. Make sure it's up-to-date */
2754 if (!PageUptodate(page)) {
2755 err = mapping->a_ops->readpage(NULL, page);
2756 if (err) {
2757 page_cache_release(page);
2758 goto out;
2759 }
2760 lock_page(page);
2761 if (!PageUptodate(page)) {
2762 err = -EIO;
2763 goto unlock;
2764 }
2765 if (page_has_buffers(page))
2766 goto has_buffers;
1da177e4 2767 }
eebd2aa3 2768 zero_user(page, offset, length);
03158cd7
NP
2769 set_page_dirty(page);
2770 err = 0;
2771
2772unlock:
1da177e4
LT
2773 unlock_page(page);
2774 page_cache_release(page);
2775out:
03158cd7 2776 return err;
1da177e4
LT
2777}
2778EXPORT_SYMBOL(nobh_truncate_page);
2779
2780int block_truncate_page(struct address_space *mapping,
2781 loff_t from, get_block_t *get_block)
2782{
2783 pgoff_t index = from >> PAGE_CACHE_SHIFT;
2784 unsigned offset = from & (PAGE_CACHE_SIZE-1);
2785 unsigned blocksize;
54b21a79 2786 sector_t iblock;
1da177e4
LT
2787 unsigned length, pos;
2788 struct inode *inode = mapping->host;
2789 struct page *page;
2790 struct buffer_head *bh;
1da177e4
LT
2791 int err;
2792
2793 blocksize = 1 << inode->i_blkbits;
2794 length = offset & (blocksize - 1);
2795
2796 /* Block boundary? Nothing to do */
2797 if (!length)
2798 return 0;
2799
2800 length = blocksize - length;
54b21a79 2801 iblock = (sector_t)index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1da177e4
LT
2802
2803 page = grab_cache_page(mapping, index);
2804 err = -ENOMEM;
2805 if (!page)
2806 goto out;
2807
2808 if (!page_has_buffers(page))
2809 create_empty_buffers(page, blocksize, 0);
2810
2811 /* Find the buffer that contains "offset" */
2812 bh = page_buffers(page);
2813 pos = blocksize;
2814 while (offset >= pos) {
2815 bh = bh->b_this_page;
2816 iblock++;
2817 pos += blocksize;
2818 }
2819
2820 err = 0;
2821 if (!buffer_mapped(bh)) {
b0cf2321 2822 WARN_ON(bh->b_size != blocksize);
1da177e4
LT
2823 err = get_block(inode, iblock, bh, 0);
2824 if (err)
2825 goto unlock;
2826 /* unmapped? It's a hole - nothing to do */
2827 if (!buffer_mapped(bh))
2828 goto unlock;
2829 }
2830
2831 /* Ok, it's mapped. Make sure it's up-to-date */
2832 if (PageUptodate(page))
2833 set_buffer_uptodate(bh);
2834
33a266dd 2835 if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
1da177e4
LT
2836 err = -EIO;
2837 ll_rw_block(READ, 1, &bh);
2838 wait_on_buffer(bh);
2839 /* Uhhuh. Read error. Complain and punt. */
2840 if (!buffer_uptodate(bh))
2841 goto unlock;
2842 }
2843
eebd2aa3 2844 zero_user(page, offset, length);
1da177e4
LT
2845 mark_buffer_dirty(bh);
2846 err = 0;
2847
2848unlock:
2849 unlock_page(page);
2850 page_cache_release(page);
2851out:
2852 return err;
2853}
2854
2855/*
2856 * The generic ->writepage function for buffer-backed address_spaces
2857 */
2858int block_write_full_page(struct page *page, get_block_t *get_block,
2859 struct writeback_control *wbc)
2860{
2861 struct inode * const inode = page->mapping->host;
2862 loff_t i_size = i_size_read(inode);
2863 const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
2864 unsigned offset;
1da177e4
LT
2865
2866 /* Is the page fully inside i_size? */
2867 if (page->index < end_index)
2868 return __block_write_full_page(inode, page, get_block, wbc);
2869
2870 /* Is the page fully outside i_size? (truncate in progress) */
2871 offset = i_size & (PAGE_CACHE_SIZE-1);
2872 if (page->index >= end_index+1 || !offset) {
2873 /*
2874 * The page may have dirty, unmapped buffers. For example,
2875 * they may have been added in ext3_writepage(). Make them
2876 * freeable here, so the page does not leak.
2877 */
aaa4059b 2878 do_invalidatepage(page, 0);
1da177e4
LT
2879 unlock_page(page);
2880 return 0; /* don't care */
2881 }
2882
2883 /*
2884 * The page straddles i_size. It must be zeroed out on each and every
2885 * writepage invokation because it may be mmapped. "A file is mapped
2886 * in multiples of the page size. For a file that is not a multiple of
2887 * the page size, the remaining memory is zeroed when mapped, and
2888 * writes to that region are not written out to the file."
2889 */
eebd2aa3 2890 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
1da177e4
LT
2891 return __block_write_full_page(inode, page, get_block, wbc);
2892}
2893
2894sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
2895 get_block_t *get_block)
2896{
2897 struct buffer_head tmp;
2898 struct inode *inode = mapping->host;
2899 tmp.b_state = 0;
2900 tmp.b_blocknr = 0;
b0cf2321 2901 tmp.b_size = 1 << inode->i_blkbits;
1da177e4
LT
2902 get_block(inode, block, &tmp, 0);
2903 return tmp.b_blocknr;
2904}
2905
6712ecf8 2906static void end_bio_bh_io_sync(struct bio *bio, int err)
1da177e4
LT
2907{
2908 struct buffer_head *bh = bio->bi_private;
2909
1da177e4
LT
2910 if (err == -EOPNOTSUPP) {
2911 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
2912 set_bit(BH_Eopnotsupp, &bh->b_state);
2913 }
2914
2915 bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
2916 bio_put(bio);
1da177e4
LT
2917}
2918
2919int submit_bh(int rw, struct buffer_head * bh)
2920{
2921 struct bio *bio;
2922 int ret = 0;
2923
2924 BUG_ON(!buffer_locked(bh));
2925 BUG_ON(!buffer_mapped(bh));
2926 BUG_ON(!bh->b_end_io);
2927
48fd4f93
JA
2928 /*
2929 * Mask in barrier bit for a write (could be either a WRITE or a
2930 * WRITE_SYNC
2931 */
2932 if (buffer_ordered(bh) && (rw & WRITE))
2933 rw |= WRITE_BARRIER;
1da177e4
LT
2934
2935 /*
48fd4f93 2936 * Only clear out a write error when rewriting
1da177e4 2937 */
48fd4f93 2938 if (test_set_buffer_req(bh) && (rw & WRITE))
1da177e4
LT
2939 clear_buffer_write_io_error(bh);
2940
2941 /*
2942 * from here on down, it's all bio -- do the initial mapping,
2943 * submit_bio -> generic_make_request may further map this bio around
2944 */
2945 bio = bio_alloc(GFP_NOIO, 1);
2946
2947 bio->bi_sector = bh->b_blocknr * (bh->b_size >> 9);
2948 bio->bi_bdev = bh->b_bdev;
2949 bio->bi_io_vec[0].bv_page = bh->b_page;
2950 bio->bi_io_vec[0].bv_len = bh->b_size;
2951 bio->bi_io_vec[0].bv_offset = bh_offset(bh);
2952
2953 bio->bi_vcnt = 1;
2954 bio->bi_idx = 0;
2955 bio->bi_size = bh->b_size;
2956
2957 bio->bi_end_io = end_bio_bh_io_sync;
2958 bio->bi_private = bh;
2959
2960 bio_get(bio);
2961 submit_bio(rw, bio);
2962
2963 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2964 ret = -EOPNOTSUPP;
2965
2966 bio_put(bio);
2967 return ret;
2968}
2969
2970/**
2971 * ll_rw_block: low-level access to block devices (DEPRECATED)
a7662236 2972 * @rw: whether to %READ or %WRITE or %SWRITE or maybe %READA (readahead)
1da177e4
LT
2973 * @nr: number of &struct buffer_heads in the array
2974 * @bhs: array of pointers to &struct buffer_head
2975 *
a7662236
JK
2976 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
2977 * requests an I/O operation on them, either a %READ or a %WRITE. The third
2978 * %SWRITE is like %WRITE only we make sure that the *current* data in buffers
2979 * are sent to disk. The fourth %READA option is described in the documentation
2980 * for generic_make_request() which ll_rw_block() calls.
1da177e4
LT
2981 *
2982 * This function drops any buffer that it cannot get a lock on (with the
a7662236
JK
2983 * BH_Lock state bit) unless SWRITE is required, any buffer that appears to be
2984 * clean when doing a write request, and any buffer that appears to be
2985 * up-to-date when doing read request. Further it marks as clean buffers that
2986 * are processed for writing (the buffer cache won't assume that they are
2987 * actually clean until the buffer gets unlocked).
1da177e4
LT
2988 *
2989 * ll_rw_block sets b_end_io to simple completion handler that marks
2990 * the buffer up-to-date (if approriate), unlocks the buffer and wakes
2991 * any waiters.
2992 *
2993 * All of the buffers must be for the same device, and must also be a
2994 * multiple of the current approved size for the device.
2995 */
2996void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
2997{
2998 int i;
2999
3000 for (i = 0; i < nr; i++) {
3001 struct buffer_head *bh = bhs[i];
3002
18ce3751 3003 if (rw == SWRITE || rw == SWRITE_SYNC)
a7662236 3004 lock_buffer(bh);
ca5de404 3005 else if (!trylock_buffer(bh))
1da177e4
LT
3006 continue;
3007
18ce3751 3008 if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) {
1da177e4 3009 if (test_clear_buffer_dirty(bh)) {
76c3073a 3010 bh->b_end_io = end_buffer_write_sync;
e60e5c50 3011 get_bh(bh);
18ce3751
JA
3012 if (rw == SWRITE_SYNC)
3013 submit_bh(WRITE_SYNC, bh);
3014 else
3015 submit_bh(WRITE, bh);
1da177e4
LT
3016 continue;
3017 }
3018 } else {
1da177e4 3019 if (!buffer_uptodate(bh)) {
76c3073a 3020 bh->b_end_io = end_buffer_read_sync;
e60e5c50 3021 get_bh(bh);
1da177e4
LT
3022 submit_bh(rw, bh);
3023 continue;
3024 }
3025 }
3026 unlock_buffer(bh);
1da177e4
LT
3027 }
3028}
3029
3030/*
3031 * For a data-integrity writeout, we need to wait upon any in-progress I/O
3032 * and then start new I/O and then wait upon it. The caller must have a ref on
3033 * the buffer_head.
3034 */
3035int sync_dirty_buffer(struct buffer_head *bh)
3036{
3037 int ret = 0;
3038
3039 WARN_ON(atomic_read(&bh->b_count) < 1);
3040 lock_buffer(bh);
3041 if (test_clear_buffer_dirty(bh)) {
3042 get_bh(bh);
3043 bh->b_end_io = end_buffer_write_sync;
18ce3751 3044 ret = submit_bh(WRITE_SYNC, bh);
1da177e4
LT
3045 wait_on_buffer(bh);
3046 if (buffer_eopnotsupp(bh)) {
3047 clear_buffer_eopnotsupp(bh);
3048 ret = -EOPNOTSUPP;
3049 }
3050 if (!ret && !buffer_uptodate(bh))
3051 ret = -EIO;
3052 } else {
3053 unlock_buffer(bh);
3054 }
3055 return ret;
3056}
3057
3058/*
3059 * try_to_free_buffers() checks if all the buffers on this particular page
3060 * are unused, and releases them if so.
3061 *
3062 * Exclusion against try_to_free_buffers may be obtained by either
3063 * locking the page or by holding its mapping's private_lock.
3064 *
3065 * If the page is dirty but all the buffers are clean then we need to
3066 * be sure to mark the page clean as well. This is because the page
3067 * may be against a block device, and a later reattachment of buffers
3068 * to a dirty page will set *all* buffers dirty. Which would corrupt
3069 * filesystem data on the same device.
3070 *
3071 * The same applies to regular filesystem pages: if all the buffers are
3072 * clean then we set the page clean and proceed. To do that, we require
3073 * total exclusion from __set_page_dirty_buffers(). That is obtained with
3074 * private_lock.
3075 *
3076 * try_to_free_buffers() is non-blocking.
3077 */
3078static inline int buffer_busy(struct buffer_head *bh)
3079{
3080 return atomic_read(&bh->b_count) |
3081 (bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
3082}
3083
3084static int
3085drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
3086{
3087 struct buffer_head *head = page_buffers(page);
3088 struct buffer_head *bh;
3089
3090 bh = head;
3091 do {
de7d5a3b 3092 if (buffer_write_io_error(bh) && page->mapping)
1da177e4
LT
3093 set_bit(AS_EIO, &page->mapping->flags);
3094 if (buffer_busy(bh))
3095 goto failed;
3096 bh = bh->b_this_page;
3097 } while (bh != head);
3098
3099 do {
3100 struct buffer_head *next = bh->b_this_page;
3101
535ee2fb 3102 if (bh->b_assoc_map)
1da177e4
LT
3103 __remove_assoc_queue(bh);
3104 bh = next;
3105 } while (bh != head);
3106 *buffers_to_free = head;
3107 __clear_page_buffers(page);
3108 return 1;
3109failed:
3110 return 0;
3111}
3112
3113int try_to_free_buffers(struct page *page)
3114{
3115 struct address_space * const mapping = page->mapping;
3116 struct buffer_head *buffers_to_free = NULL;
3117 int ret = 0;
3118
3119 BUG_ON(!PageLocked(page));
ecdfc978 3120 if (PageWriteback(page))
1da177e4
LT
3121 return 0;
3122
3123 if (mapping == NULL) { /* can this still happen? */
3124 ret = drop_buffers(page, &buffers_to_free);
3125 goto out;
3126 }
3127
3128 spin_lock(&mapping->private_lock);
3129 ret = drop_buffers(page, &buffers_to_free);
ecdfc978
LT
3130
3131 /*
3132 * If the filesystem writes its buffers by hand (eg ext3)
3133 * then we can have clean buffers against a dirty page. We
3134 * clean the page here; otherwise the VM will never notice
3135 * that the filesystem did any IO at all.
3136 *
3137 * Also, during truncate, discard_buffer will have marked all
3138 * the page's buffers clean. We discover that here and clean
3139 * the page also.
87df7241
NP
3140 *
3141 * private_lock must be held over this entire operation in order
3142 * to synchronise against __set_page_dirty_buffers and prevent the
3143 * dirty bit from being lost.
ecdfc978
LT
3144 */
3145 if (ret)
3146 cancel_dirty_page(page, PAGE_CACHE_SIZE);
87df7241 3147 spin_unlock(&mapping->private_lock);
1da177e4
LT
3148out:
3149 if (buffers_to_free) {
3150 struct buffer_head *bh = buffers_to_free;
3151
3152 do {
3153 struct buffer_head *next = bh->b_this_page;
3154 free_buffer_head(bh);
3155 bh = next;
3156 } while (bh != buffers_to_free);
3157 }
3158 return ret;
3159}
3160EXPORT_SYMBOL(try_to_free_buffers);
3161
3978d717 3162void block_sync_page(struct page *page)
1da177e4
LT
3163{
3164 struct address_space *mapping;
3165
3166 smp_mb();
3167 mapping = page_mapping(page);
3168 if (mapping)
3169 blk_run_backing_dev(mapping->backing_dev_info, page);
1da177e4
LT
3170}
3171
3172/*
3173 * There are no bdflush tunables left. But distributions are
3174 * still running obsolete flush daemons, so we terminate them here.
3175 *
3176 * Use of bdflush() is deprecated and will be removed in a future kernel.
3177 * The `pdflush' kernel threads fully replace bdflush daemons and this call.
3178 */
3179asmlinkage long sys_bdflush(int func, long data)
3180{
3181 static int msg_count;
3182
3183 if (!capable(CAP_SYS_ADMIN))
3184 return -EPERM;
3185
3186 if (msg_count < 5) {
3187 msg_count++;
3188 printk(KERN_INFO
3189 "warning: process `%s' used the obsolete bdflush"
3190 " system call\n", current->comm);
3191 printk(KERN_INFO "Fix your initscripts?\n");
3192 }
3193
3194 if (func == 1)
3195 do_exit(0);
3196 return 0;
3197}
3198
3199/*
3200 * Buffer-head allocation
3201 */
/* Slab cache that all buffer heads are allocated from; created in buffer_init(). */
e18b890b 3202static struct kmem_cache *bh_cachep;
1da177e4
LT
3203
3204/*
3205 * Once the number of bh's in the machine exceeds this level, we start
3206 * stripping them in writeback.
3207 */
3208static int max_buffer_heads;
3209
/*
 * Non-zero while the live buffer-head count summed over the online CPUs
 * is above max_buffer_heads. Recomputed by recalc_bh_state().
 */
3210int buffer_heads_over_limit;
3211
/*
 * Per-CPU buffer-head bookkeeping. nr is adjusted on every allocation and
 * free; ratelimit counts calls to recalc_bh_state() so that the sum over
 * all CPUs is only recomputed once it has passed 4096.
 */
3212struct bh_accounting {
3213 int nr; /* Number of live bh's */
3214 int ratelimit; /* Limit cacheline bouncing */
3215};
3216
/* One zero-initialised bh_accounting instance per CPU. */
3217static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};
3218
3219static void recalc_bh_state(void)
3220{
3221 int i;
3222 int tot = 0;
3223
3224 if (__get_cpu_var(bh_accounting).ratelimit++ < 4096)
3225 return;
3226 __get_cpu_var(bh_accounting).ratelimit = 0;
8a143426 3227 for_each_online_cpu(i)
1da177e4
LT
3228 tot += per_cpu(bh_accounting, i).nr;
3229 buffer_heads_over_limit = (tot > max_buffer_heads);
3230}
3231
dd0fc66f 3232struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
1da177e4 3233{
488514d1 3234 struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags);
1da177e4 3235 if (ret) {
a35afb83 3236 INIT_LIST_HEAD(&ret->b_assoc_buffers);
736c7b80 3237 get_cpu_var(bh_accounting).nr++;
1da177e4 3238 recalc_bh_state();
736c7b80 3239 put_cpu_var(bh_accounting);
1da177e4
LT
3240 }
3241 return ret;
3242}
3243EXPORT_SYMBOL(alloc_buffer_head);
3244
3245void free_buffer_head(struct buffer_head *bh)
3246{
3247 BUG_ON(!list_empty(&bh->b_assoc_buffers));
3248 kmem_cache_free(bh_cachep, bh);
736c7b80 3249 get_cpu_var(bh_accounting).nr--;
1da177e4 3250 recalc_bh_state();
736c7b80 3251 put_cpu_var(bh_accounting);
1da177e4
LT
3252}
3253EXPORT_SYMBOL(free_buffer_head);
3254
1da177e4
LT
3255static void buffer_exit_cpu(int cpu)
3256{
3257 int i;
3258 struct bh_lru *b = &per_cpu(bh_lrus, cpu);
3259
3260 for (i = 0; i < BH_LRU_SIZE; i++) {
3261 brelse(b->bhs[i]);
3262 b->bhs[i] = NULL;
3263 }
8a143426
ED
3264 get_cpu_var(bh_accounting).nr += per_cpu(bh_accounting, cpu).nr;
3265 per_cpu(bh_accounting, cpu).nr = 0;
3266 put_cpu_var(bh_accounting);
1da177e4
LT
3267}
3268
3269static int buffer_cpu_notify(struct notifier_block *self,
3270 unsigned long action, void *hcpu)
3271{
8bb78442 3272 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
1da177e4
LT
3273 buffer_exit_cpu((unsigned long)hcpu);
3274 return NOTIFY_OK;
3275}
1da177e4 3276
/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Returns 1, with the buffer unlocked, if the buffer is up-to-date.
 * Returns 0, with the buffer left locked, if it is not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	/* Fast path: no lock is needed when the data is already valid. */
	if (buffer_uptodate(bh))
		return 1;

	lock_buffer(bh);
	/* Re-check: the buffer may have become up-to-date before we got the lock. */
	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);
3295
3296/**
a6b91919 3297 * bh_submit_read - Submit a locked buffer for reading
389d1b08
AK
3298 * @bh: struct buffer_head
3299 *
3300 * Returns zero on success and -EIO on error.
3301 */
3302int bh_submit_read(struct buffer_head *bh)
3303{
3304 BUG_ON(!buffer_locked(bh));
3305
3306 if (buffer_uptodate(bh)) {
3307 unlock_buffer(bh);
3308 return 0;
3309 }
3310
3311 get_bh(bh);
3312 bh->b_end_io = end_buffer_read_sync;
3313 submit_bh(READ, bh);
3314 wait_on_buffer(bh);
3315 if (buffer_uptodate(bh))
3316 return 0;
3317 return -EIO;
3318}
3319EXPORT_SYMBOL(bh_submit_read);
3320
b98938c3 3321static void
51cc5068 3322init_buffer_head(void *data)
b98938c3
CL
3323{
3324 struct buffer_head *bh = data;
3325
3326 memset(bh, 0, sizeof(*bh));
3327 INIT_LIST_HEAD(&bh->b_assoc_buffers);
3328}
3329
1da177e4
LT
3330void __init buffer_init(void)
3331{
3332 int nrpages;
3333
b98938c3
CL
3334 bh_cachep = kmem_cache_create("buffer_head",
3335 sizeof(struct buffer_head), 0,
3336 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
3337 SLAB_MEM_SPREAD),
3338 init_buffer_head);
1da177e4
LT
3339
3340 /*
3341 * Limit the bh occupancy to 10% of ZONE_NORMAL
3342 */
3343 nrpages = (nr_free_buffer_pages() * 10) / 100;
3344 max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
3345 hotcpu_notifier(buffer_cpu_notify, 0);
3346}
3347
/*
 * Symbols exported with EXPORT_SYMBOL() that do not carry their export
 * next to their definition. The list is kept in alphabetical order.
 */
3348EXPORT_SYMBOL(__bforget);
3349EXPORT_SYMBOL(__brelse);
3350EXPORT_SYMBOL(__wait_on_buffer);
3351EXPORT_SYMBOL(block_commit_write);
3352EXPORT_SYMBOL(block_prepare_write);
54171690 3353EXPORT_SYMBOL(block_page_mkwrite);
1da177e4
LT
3354EXPORT_SYMBOL(block_read_full_page);
3355EXPORT_SYMBOL(block_sync_page);
3356EXPORT_SYMBOL(block_truncate_page);
3357EXPORT_SYMBOL(block_write_full_page);
89e10787 3358EXPORT_SYMBOL(cont_write_begin);
1da177e4
LT
3359EXPORT_SYMBOL(end_buffer_read_sync);
3360EXPORT_SYMBOL(end_buffer_write_sync);
3361EXPORT_SYMBOL(file_fsync);
3362EXPORT_SYMBOL(fsync_bdev);
3363EXPORT_SYMBOL(generic_block_bmap);
05eb0b51 3364EXPORT_SYMBOL(generic_cont_expand_simple);
1da177e4
LT
3365EXPORT_SYMBOL(init_buffer);
3366EXPORT_SYMBOL(invalidate_bdev);
3367EXPORT_SYMBOL(ll_rw_block);
3368EXPORT_SYMBOL(mark_buffer_dirty);
3369EXPORT_SYMBOL(submit_bh);
3370EXPORT_SYMBOL(sync_dirty_buffer);
3371EXPORT_SYMBOL(unlock_buffer);