/*
 *	linux/mm/filemap_xip.c
 *
 * Copyright (C) 2005 IBM Corporation
 * Author: Carsten Otte <cotte@de.ibm.com>
 *
 * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds
 *
 */

#include <linux/fs.h>
#include <linux/pagemap.h>
#include <linux/module.h>
#include <linux/uio.h>
#include <linux/rmap.h>
#include <asm/tlbflush.h>
#include "filemap.h"

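/*
 * Illustration only (not part of the original patch): a minimal sketch of
 * the ->get_xip_page() address_space operation that everything below
 * relies on.  The helper example_lookup_block() and the example_* names
 * are made up for this sketch; a real filesystem (e.g. ext2 mounted with
 * -o xip) resolves the 512-byte sector to directly addressable memory on
 * the backing device.  The contract assumed in this file: return the
 * struct page backing 'sector', ERR_PTR(-ENODATA) for a hole when
 * create == 0, or allocate the block and return its up-to-date page when
 * create != 0.
 */
#if 0
static struct page *
example_get_xip_page(struct address_space *mapping, sector_t sector,
		     int create)
{
	unsigned long pfn;
	int err;

	/* hypothetical lookup: resolve 'sector' to a page frame number */
	err = example_lookup_block(mapping->host, sector, create, &pfn);
	if (err)
		return ERR_PTR(err);	/* -ENODATA denotes a sparse block */
	return pfn_to_page(pfn);
}
#endif
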
/*
 * This is a file read routine for execute in place files, and uses
 * the mapping->a_ops->get_xip_page() function for the actual low-level
 * stuff.
 *
 * Note: the struct file * is only used for file_accessed(); it may be
 * NULL.
 */
static void
do_xip_mapping_read(struct address_space *mapping,
		    struct file_ra_state *_ra,
		    struct file *filp,
		    loff_t *ppos,
		    read_descriptor_t *desc,
		    read_actor_t actor)
{
	struct inode *inode = mapping->host;
	unsigned long index, end_index, offset;
	loff_t isize;

	BUG_ON(!mapping->a_ops->get_xip_page);

	index = *ppos >> PAGE_CACHE_SHIFT;
	offset = *ppos & ~PAGE_CACHE_MASK;

	isize = i_size_read(inode);
	if (!isize)
		goto out;

	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
	for (;;) {
		struct page *page;
		unsigned long nr, ret;

		/* nr is the maximum number of bytes to copy from this page */
		nr = PAGE_CACHE_SIZE;
		if (index >= end_index) {
			if (index > end_index)
				goto out;
			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
			if (nr <= offset) {
				goto out;
			}
		}
		nr = nr - offset;

		page = mapping->a_ops->get_xip_page(mapping,
			index*(PAGE_SIZE/512), 0);
		if (!page)
			goto no_xip_page;
		if (unlikely(IS_ERR(page))) {
			if (PTR_ERR(page) == -ENODATA) {
				/* sparse */
				page = virt_to_page(empty_zero_page);
			} else {
				desc->error = PTR_ERR(page);
				goto out;
			}
		} else
			BUG_ON(!PageUptodate(page));

		/* If users can be writing to this page using arbitrary
		 * virtual addresses, take care about potential aliasing
		 * before reading the page on the kernel side.
		 */
		if (mapping_writably_mapped(mapping))
			flush_dcache_page(page);

		/*
		 * Ok, we have the page, and it's up-to-date, so
		 * now we can copy it to user space...
		 *
		 * The actor routine returns how many bytes were actually used..
		 * NOTE! This may not be the same as how much of a user buffer
		 * we filled up (we may be padding etc), so we can only update
		 * "pos" here (the actor routine has to update the user buffer
		 * pointers and the remaining count).
		 */
		ret = actor(desc, page, offset, nr);
		offset += ret;
		index += offset >> PAGE_CACHE_SHIFT;
		offset &= ~PAGE_CACHE_MASK;

		if (ret == nr && desc->count)
			continue;
		goto out;

no_xip_page:
		/* Did not get the page. Report it */
		desc->error = -EIO;
		goto out;
	}

out:
	*ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
	if (filp)
		file_accessed(filp);
}

/*
 * This is the "read()" routine for all filesystems
 * that use the get_xip_page address space operation.
 */
static ssize_t
__xip_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
		    unsigned long nr_segs, loff_t *ppos)
{
	struct file *filp = iocb->ki_filp;
	ssize_t retval;
	unsigned long seg;
	size_t count;

	count = 0;
	for (seg = 0; seg < nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		count += iv->iov_len;
		if (unlikely((ssize_t)(count|iv->iov_len) < 0))
			return -EINVAL;
		if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
			continue;
		if (seg == 0)
			return -EFAULT;
		nr_segs = seg;
		count -= iv->iov_len;	/* This segment is no good */
		break;
	}

	retval = 0;
	if (count) {
		for (seg = 0; seg < nr_segs; seg++) {
			read_descriptor_t desc;

			desc.written = 0;
			desc.arg.buf = iov[seg].iov_base;
			desc.count = iov[seg].iov_len;
			if (desc.count == 0)
				continue;
			desc.error = 0;
			do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
					    ppos, &desc, file_read_actor);
			retval += desc.written;
			if (!retval) {
				retval = desc.error;
				break;
			}
		}
	}
	return retval;
}

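/*
 * aio read entry point: wrap the single user buffer in an iovec and run
 * __xip_file_aio_read() at the kiocb's current position.
 */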
ssize_t
xip_file_aio_read(struct kiocb *iocb, char __user *buf, size_t count,
		  loff_t pos)
{
	struct iovec local_iov = { .iov_base = buf, .iov_len = count };

	BUG_ON(iocb->ki_pos != pos);
	return __xip_file_aio_read(iocb, &local_iov, 1, &iocb->ki_pos);
}
EXPORT_SYMBOL_GPL(xip_file_aio_read);

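/*
 * Synchronous readv entry point: run __xip_file_aio_read() under a
 * stack-allocated synchronous kiocb.
 */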
ssize_t
xip_file_readv(struct file *filp, const struct iovec *iov,
	       unsigned long nr_segs, loff_t *ppos)
{
	struct kiocb kiocb;

	init_sync_kiocb(&kiocb, filp);
	return __xip_file_aio_read(&kiocb, iov, nr_segs, ppos);
}
EXPORT_SYMBOL_GPL(xip_file_readv);

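/*
 * sendfile entry point: feed the file's pages to the caller-supplied
 * actor via do_xip_mapping_read(); no page cache is involved.
 */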
ssize_t
xip_file_sendfile(struct file *in_file, loff_t *ppos,
		  size_t count, read_actor_t actor, void *target)
{
	read_descriptor_t desc;

	if (!count)
		return 0;

	desc.written = 0;
	desc.count = count;
	desc.arg.data = target;
	desc.error = 0;

	do_xip_mapping_read(in_file->f_mapping, &in_file->f_ra, in_file,
			    ppos, &desc, actor);
	if (desc.written)
		return desc.written;
	return desc.error;
}
EXPORT_SYMBOL_GPL(xip_file_sendfile);

/*
 * __xip_unmap is invoked from xip_file_nopage() and from the
 * do_xip_file_write() path.
 *
 * This function walks all vmas of the address_space and unmaps the
 * empty_zero_page when found at pgoff. Should it go in rmap.c?
 */
static void
__xip_unmap (struct address_space * mapping,
	     unsigned long pgoff)
{
	struct vm_area_struct *vma;
	struct mm_struct *mm;
	struct prio_tree_iter iter;
	unsigned long address;
	pte_t *pte;
	pte_t pteval;

	spin_lock(&mapping->i_mmap_lock);
	vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
		mm = vma->vm_mm;
		address = vma->vm_start +
			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
		/*
		 * We need the page_table_lock to protect us from page faults,
		 * munmap, fork, etc...
		 */
		pte = page_check_address(virt_to_page(empty_zero_page), mm,
					 address);
		if (!IS_ERR(pte)) {
			/* Nuke the page table entry. */
			flush_cache_page(vma, address, pte_pfn(*pte));
			pteval = ptep_clear_flush(vma, address, pte);
			BUG_ON(pte_dirty(pteval));
			pte_unmap(pte);
			spin_unlock(&mm->page_table_lock);
		}
	}
	spin_unlock(&mapping->i_mmap_lock);
}

/*
 * xip_file_nopage() is invoked via the vma operations vector for a
 * mapped memory region to read in file data during a page fault.
 *
 * This function is derived from filemap_nopage, but is used for
 * execute-in-place mappings.
 */
static struct page *
xip_file_nopage(struct vm_area_struct * area,
		unsigned long address,
		int *type)
{
	struct file *file = area->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	struct page *page;
	unsigned long size, pgoff, endoff;

	pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT)
		+ area->vm_pgoff;
	endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT)
		+ area->vm_pgoff;

	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
	if (pgoff >= size) {
		return NULL;
	}

	page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
	if (!IS_ERR(page)) {
		BUG_ON(!PageUptodate(page));
		return page;
	}
	if (PTR_ERR(page) != -ENODATA)
		return NULL;

	/* sparse block */
	if ((area->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
	    (area->vm_flags & (VM_SHARED| VM_MAYSHARE)) &&
	    (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
		/* maybe shared writable, allocate new block */
		page = mapping->a_ops->get_xip_page (mapping,
			pgoff*(PAGE_SIZE/512), 1);
		if (IS_ERR(page))
			return NULL;
		BUG_ON(!PageUptodate(page));
		/* unmap page at pgoff from all other vmas */
		__xip_unmap(mapping, pgoff);
	} else {
		/* not a writable shared mapping: no block needed, map the
		 * empty_zero_page instead */
		page = virt_to_page(empty_zero_page);
	}

	return page;
}

static struct vm_operations_struct xip_file_vm_ops = {
	.nopage = xip_file_nopage,
};

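/*
 * mmap entry point: install xip_file_vm_ops so page faults are served by
 * xip_file_nopage() straight from the backing store, bypassing the page
 * cache.
 */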
int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
{
	BUG_ON(!file->f_mapping->a_ops->get_xip_page);

	file_accessed(file);
	vma->vm_ops = &xip_file_vm_ops;
	return 0;
}
EXPORT_SYMBOL_GPL(xip_file_mmap);

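/*
 * Copy data from the user iovecs directly into the backing pages returned
 * by ->get_xip_page(), allocating blocks (and unmapping the zero page from
 * other vmas) where the file is sparse.  The caller holds i_sem.
 */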
static ssize_t
do_xip_file_write(struct kiocb *iocb, const struct iovec *iov,
		  unsigned long nr_segs, loff_t pos, loff_t *ppos,
		  size_t count)
{
	struct file *file = iocb->ki_filp;
	struct address_space * mapping = file->f_mapping;
	struct address_space_operations *a_ops = mapping->a_ops;
	struct inode *inode = mapping->host;
	long status = 0;
	struct page *page;
	size_t bytes;
	const struct iovec *cur_iov = iov;	/* current iovec */
	size_t iov_base = 0;			/* offset in the current iovec */
	char __user *buf;
	ssize_t written = 0;

	BUG_ON(!mapping->a_ops->get_xip_page);

	buf = iov->iov_base;
	do {
		unsigned long index;
		unsigned long offset;
		size_t copied;

		offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
		index = pos >> PAGE_CACHE_SHIFT;
		bytes = PAGE_CACHE_SIZE - offset;
		if (bytes > count)
			bytes = count;

		/*
		 * Bring in the user page that we will copy from _first_.
		 * Otherwise there's a nasty deadlock on copying from the
		 * same page as we're writing to, without it being marked
		 * up-to-date.
		 */
		fault_in_pages_readable(buf, bytes);

		page = a_ops->get_xip_page(mapping,
					   index*(PAGE_SIZE/512), 0);
		if (IS_ERR(page) && (PTR_ERR(page) == -ENODATA)) {
			/* sparse block: allocate it, then unmap the zero
			 * page at this offset from all other vmas */
			page = a_ops->get_xip_page(mapping,
						   index*(PAGE_SIZE/512), 1);
			if (!IS_ERR(page))
				__xip_unmap(mapping, index);
		}

		if (IS_ERR(page)) {
			status = PTR_ERR(page);
			break;
		}

		BUG_ON(!PageUptodate(page));

		if (likely(nr_segs == 1))
			copied = filemap_copy_from_user(page, offset,
							buf, bytes);
		else
			copied = filemap_copy_from_user_iovec(page, offset,
						cur_iov, iov_base, bytes);
		flush_dcache_page(page);
		if (likely(copied > 0)) {
			status = copied;

			if (status >= 0) {
				written += status;
				count -= status;
				pos += status;
				buf += status;
				if (unlikely(nr_segs > 1))
					filemap_set_next_iovec(&cur_iov,
							&iov_base, status);
			}
		}
		if (unlikely(copied != bytes))
			if (status >= 0)
				status = -EFAULT;
		if (status < 0)
			break;
	} while (count);
	*ppos = pos;
	/*
	 * No need to use i_size_read() here, the i_size
	 * cannot change under us because we hold i_sem.
	 */
	if (pos > inode->i_size) {
		i_size_write(inode, pos);
		mark_inode_dirty(inode);
	}

	return written ? written : status;
}

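/*
 * Validate the iovecs and apply the usual pre-write checks (size limits,
 * suid removal, timestamp update) before handing off to
 * do_xip_file_write().  Does not take i_sem; the exported wrappers below
 * take it.
 */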
static ssize_t
xip_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov,
			  unsigned long nr_segs, loff_t *ppos)
{
	struct file *file = iocb->ki_filp;
	struct address_space * mapping = file->f_mapping;
	size_t ocount;		/* original count */
	size_t count;		/* after file limit checks */
	struct inode *inode = mapping->host;
	unsigned long seg;
	loff_t pos;
	ssize_t written;
	ssize_t err;

	ocount = 0;
	for (seg = 0; seg < nr_segs; seg++) {
		const struct iovec *iv = &iov[seg];

		/*
		 * If any segment has a negative length, or the cumulative
		 * length ever wraps negative then return -EINVAL.
		 */
		ocount += iv->iov_len;
		if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
			return -EINVAL;
		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
			continue;
		if (seg == 0)
			return -EFAULT;
		nr_segs = seg;
		ocount -= iv->iov_len;	/* This segment is no good */
		break;
	}

	count = ocount;
	pos = *ppos;

	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

	written = 0;

	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
	if (err)
		goto out;

	if (count == 0)
		goto out;

	err = remove_suid(file->f_dentry);
	if (err)
		goto out;

	inode_update_time(inode, 1);

	/* use execute in place to copy directly to disk */
	written = do_xip_file_write(iocb, iov, nr_segs, pos, ppos, count);
 out:
	return written ? written : err;
}

static ssize_t
__xip_file_write_nolock(struct file *file, const struct iovec *iov,
			unsigned long nr_segs, loff_t *ppos)
{
	struct kiocb kiocb;

	init_sync_kiocb(&kiocb, file);
	return xip_file_aio_write_nolock(&kiocb, iov, nr_segs, ppos);
}

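/*
 * aio write entry point: serialize on i_sem and push the single user
 * buffer through xip_file_aio_write_nolock().
 */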
ssize_t
xip_file_aio_write(struct kiocb *iocb, const char __user *buf,
		   size_t count, loff_t pos)
{
	struct file *file = iocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;
	struct iovec local_iov = { .iov_base = (void __user *)buf,
				   .iov_len = count };

	BUG_ON(iocb->ki_pos != pos);

	down(&inode->i_sem);
	ret = xip_file_aio_write_nolock(iocb, &local_iov, 1, &iocb->ki_pos);
	up(&inode->i_sem);
	return ret;
}
EXPORT_SYMBOL_GPL(xip_file_aio_write);

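/*
 * Synchronous writev entry point: take i_sem and run the write through a
 * synchronous kiocb.
 */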
ssize_t xip_file_writev(struct file *file, const struct iovec *iov,
			unsigned long nr_segs, loff_t *ppos)
{
	struct address_space *mapping = file->f_mapping;
	struct inode *inode = mapping->host;
	ssize_t ret;

	down(&inode->i_sem);
	ret = __xip_file_write_nolock(file, iov, nr_segs, ppos);
	up(&inode->i_sem);
	return ret;
}
EXPORT_SYMBOL_GPL(xip_file_writev);
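
/*
 * Illustration (not part of the original patch): how a filesystem that
 * supports execute in place might wire the helpers exported above into
 * its file_operations.  The structure name is made up; ext2 with the
 * "xip" mount option does something along these lines.
 */
#if 0
static struct file_operations example_xip_file_operations = {
	.llseek		= generic_file_llseek,
	.aio_read	= xip_file_aio_read,
	.aio_write	= xip_file_aio_write,
	.readv		= xip_file_readv,
	.writev		= xip_file_writev,
	.mmap		= xip_file_mmap,
	.sendfile	= xip_file_sendfile,
};
#endif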

/*
 * truncate a page used for execute in place
 * functionality is analogous to block_truncate_page but uses get_xip_page
 * to obtain the page instead of going through the page cache
 */
int
xip_truncate_page(struct address_space *mapping, loff_t from)
{
	pgoff_t index = from >> PAGE_CACHE_SHIFT;
	unsigned offset = from & (PAGE_CACHE_SIZE-1);
	unsigned blocksize;
	unsigned length;
	struct page *page;
	void *kaddr;
	int err;

	BUG_ON(!mapping->a_ops->get_xip_page);

	blocksize = 1 << mapping->host->i_blkbits;
	length = offset & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;

	length = blocksize - length;

	page = mapping->a_ops->get_xip_page(mapping,
					    index*(PAGE_SIZE/512), 0);
	err = -ENOMEM;
	if (!page)
		goto out;
	if (unlikely(IS_ERR(page))) {
		if (PTR_ERR(page) == -ENODATA) {
			/* Hole? No need to truncate */
			return 0;
		} else {
			err = PTR_ERR(page);
			goto out;
		}
	} else
		BUG_ON(!PageUptodate(page));
	kaddr = kmap_atomic(page, KM_USER0);
	memset(kaddr + offset, 0, length);
	kunmap_atomic(kaddr, KM_USER0);

	flush_dcache_page(page);
	err = 0;
out:
	return err;
}
EXPORT_SYMBOL_GPL(xip_truncate_page);
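
/*
 * Illustration (not part of the original patch): a filesystem's truncate
 * path would typically zero the tail of the last block with
 * xip_truncate_page(), just as it would otherwise use
 * block_truncate_page().  The function name below is made up.
 */
#if 0
static int example_truncate_tail(struct inode *inode)
{
	/* zero the partial block beyond the new i_size */
	return xip_truncate_page(inode->i_mapping, inode->i_size);
}
#endif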