/*
 *  linux/fs/pipe.c
 *
 *  Copyright (C) 1991, 1992, 1999  Linus Torvalds
 */

#include <linux/mm.h>
#include <linux/file.h>
#include <linux/poll.h>
#include <linux/slab.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/log2.h>
#include <linux/mount.h>
#include <linux/pipe_fs_i.h>
#include <linux/uio.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/audit.h>
#include <linux/syscalls.h>

#include <asm/uaccess.h>
#include <asm/ioctls.h>

/*
 * We use a start+len construction, which provides full use of the
 * allocated memory.
 * -- Florian Coosmann (FGC)
 *
 * Reads with count = 0 should always return 0.
 * -- Julian Bradfield 1999-06-07.
 *
 * FIFOs and Pipes now generate SIGIO for both readers and writers.
 * -- Jeremy Elson <jelson@circlemud.org> 2001-08-16
 *
 * pipe_read & write cleanup
 * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09
 */

static void pipe_lock_nested(struct pipe_inode_info *pipe, int subclass)
{
        if (pipe->inode)
                mutex_lock_nested(&pipe->inode->i_mutex, subclass);
}

void pipe_lock(struct pipe_inode_info *pipe)
{
        /*
         * pipe_lock() nests non-pipe inode locks (for writing to a file)
         */
        pipe_lock_nested(pipe, I_MUTEX_PARENT);
}
EXPORT_SYMBOL(pipe_lock);

void pipe_unlock(struct pipe_inode_info *pipe)
{
        if (pipe->inode)
                mutex_unlock(&pipe->inode->i_mutex);
}
EXPORT_SYMBOL(pipe_unlock);

void pipe_double_lock(struct pipe_inode_info *pipe1,
                      struct pipe_inode_info *pipe2)
{
        BUG_ON(pipe1 == pipe2);

        if (pipe1 < pipe2) {
                pipe_lock_nested(pipe1, I_MUTEX_PARENT);
                pipe_lock_nested(pipe2, I_MUTEX_CHILD);
        } else {
                pipe_lock_nested(pipe2, I_MUTEX_PARENT);
                pipe_lock_nested(pipe1, I_MUTEX_CHILD);
        }
}

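/*
 * Editor's note: a hedged sketch of how callers are expected to use
 * pipe_double_lock(). Because the address comparison above imposes one
 * global lock order, two tasks locking the same pair of pipes cannot
 * deadlock. The names ipipe/opipe are illustrative only (tee() in
 * fs/splice.c follows this pattern):
 *
 *      struct pipe_inode_info *ipipe = <source pipe>;
 *      struct pipe_inode_info *opipe = <destination pipe>;
 *
 *      pipe_double_lock(ipipe, opipe);
 *      // ... move or duplicate buffers between the two pipes ...
 *      pipe_unlock(ipipe);
 *      pipe_unlock(opipe);
 */
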
/* Drop the inode semaphore and wait for a pipe event, atomically */
void pipe_wait(struct pipe_inode_info *pipe)
{
        DEFINE_WAIT(wait);

        /*
         * Pipes are system-local resources, so sleeping on them
         * is considered a noninteractive wait:
         */
        prepare_to_wait(&pipe->wait, &wait, TASK_INTERRUPTIBLE);
        pipe_unlock(pipe);
        schedule();
        finish_wait(&pipe->wait, &wait);
        pipe_lock(pipe);
}

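/*
 * Editor's note: pipe_wait() must be entered with the pipe lock held; it
 * drops the lock around schedule() and retakes it before returning, so
 * callers have to re-check their wakeup condition afterwards. A minimal
 * sketch of the canonical pattern (as used by pipe_read/pipe_write below):
 *
 *      mutex_lock(&inode->i_mutex);
 *      for (;;) {
 *              if (<condition satisfied>)
 *                      break;
 *              pipe_wait(pipe);        // sleeps; lock is released meanwhile
 *      }
 *      mutex_unlock(&inode->i_mutex);
 */
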
static int
pipe_iov_copy_from_user(void *to, struct iovec *iov, unsigned long len,
                        int atomic)
{
        unsigned long copy;

        while (len > 0) {
                while (!iov->iov_len)
                        iov++;
                copy = min_t(unsigned long, len, iov->iov_len);

                if (atomic) {
                        if (__copy_from_user_inatomic(to, iov->iov_base, copy))
                                return -EFAULT;
                } else {
                        if (copy_from_user(to, iov->iov_base, copy))
                                return -EFAULT;
                }
                to += copy;
                len -= copy;
                iov->iov_base += copy;
                iov->iov_len -= copy;
        }
        return 0;
}

static int
pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len,
                      int atomic)
{
        unsigned long copy;

        while (len > 0) {
                while (!iov->iov_len)
                        iov++;
                copy = min_t(unsigned long, len, iov->iov_len);

                if (atomic) {
                        if (__copy_to_user_inatomic(iov->iov_base, from, copy))
                                return -EFAULT;
                } else {
                        if (copy_to_user(iov->iov_base, from, copy))
                                return -EFAULT;
                }
                from += copy;
                len -= copy;
                iov->iov_base += copy;
                iov->iov_len -= copy;
        }
        return 0;
}

/*
 * Attempt to pre-fault in the user memory, so we can use atomic copies.
 * Returns the number of bytes not faulted in.
 */
static int iov_fault_in_pages_write(struct iovec *iov, unsigned long len)
{
        while (!iov->iov_len)
                iov++;

        while (len > 0) {
                unsigned long this_len;

                this_len = min_t(unsigned long, len, iov->iov_len);
                if (fault_in_pages_writeable(iov->iov_base, this_len))
                        break;

                len -= this_len;
                iov++;
        }

        return len;
}

/*
 * Pre-fault in the user memory, so we can use atomic copies.
 */
static void iov_fault_in_pages_read(struct iovec *iov, unsigned long len)
{
        while (!iov->iov_len)
                iov++;

        while (len > 0) {
                unsigned long this_len;

                this_len = min_t(unsigned long, len, iov->iov_len);
                fault_in_pages_readable(iov->iov_base, this_len);
                len -= this_len;
                iov++;
        }
}

static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
                                  struct pipe_buffer *buf)
{
        struct page *page = buf->page;

        /*
         * If nobody else uses this page, and we don't already have a
         * temporary page, let's keep track of it as a one-deep
         * allocation cache. (Otherwise just release our reference to it)
         */
        if (page_count(page) == 1 && !pipe->tmp_page)
                pipe->tmp_page = page;
        else
                page_cache_release(page);
}

/**
 * generic_pipe_buf_map - virtually map a pipe buffer
 * @pipe:       the pipe that the buffer belongs to
 * @buf:        the buffer that should be mapped
 * @atomic:     whether to use an atomic map
 *
 * Description:
 *      This function returns a kernel virtual address mapping for the
 *      pipe_buffer passed in @buf. If @atomic is set, an atomic map is
 *      provided and the caller has to be careful not to fault before
 *      calling the unmap function.
 *
 *      Note that this function occupies KM_USER0 if @atomic != 0.
 */
void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
                           struct pipe_buffer *buf, int atomic)
{
        if (atomic) {
                buf->flags |= PIPE_BUF_FLAG_ATOMIC;
                return kmap_atomic(buf->page, KM_USER0);
        }

        return kmap(buf->page);
}

/**
 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
 * @pipe:       the pipe that the buffer belongs to
 * @buf:        the buffer that should be unmapped
 * @map_data:   the data that the mapping function returned
 *
 * Description:
 *      This function undoes the mapping that ->map() provided.
 */
void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
                            struct pipe_buffer *buf, void *map_data)
{
        if (buf->flags & PIPE_BUF_FLAG_ATOMIC) {
                buf->flags &= ~PIPE_BUF_FLAG_ATOMIC;
                kunmap_atomic(map_data, KM_USER0);
        } else
                kunmap(buf->page);
}

/**
 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
 * @pipe:       the pipe that the buffer belongs to
 * @buf:        the buffer to attempt to steal
 *
 * Description:
 *      This function attempts to steal the &struct page attached to
 *      @buf. If successful, this function returns 0 and returns with
 *      the page locked. The caller may then reuse the page for whatever
 *      he wishes; the typical use is insertion into a different file
 *      page cache.
 */
int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
                           struct pipe_buffer *buf)
{
        struct page *page = buf->page;

        /*
         * A reference of one is golden, that means that the owner of this
         * page is the only one holding a reference to it. lock the page
         * and return OK.
         */
        if (page_count(page) == 1) {
                lock_page(page);
                return 0;
        }

        return 1;
}

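/*
 * Editor's note: a hedged sketch of the steal protocol; the caller shown
 * here is hypothetical and the bookkeeping is simplified (splice's
 * page-move path works along these lines):
 *
 *      if (buf->ops->steal(pipe, buf) == 0) {
 *              // Success: ours is the only reference and buf->page is
 *              // locked; the page can be reused directly, e.g. inserted
 *              // into a file's page cache, instead of copying the data.
 *      } else {
 *              // The page is still shared: fall back to copying it.
 *      }
 *
 * Note the unusual convention: 0 means success and 1 means failure, not a
 * negative errno.
 */
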
/**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
 * @pipe:       the pipe that the buffer belongs to
 * @buf:        the buffer to get a reference to
 *
 * Description:
 *      This function grabs an extra reference to @buf. It's used in
 *      the tee() system call, when we duplicate the buffers in one
 *      pipe into another.
 */
void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
{
        page_cache_get(buf->page);
}

/**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
 * @info:       the pipe that the buffer belongs to
 * @buf:        the buffer to confirm
 *
 * Description:
 *      This function does nothing, because the generic pipe code uses
 *      pages that are always good when inserted into the pipe.
 */
int generic_pipe_buf_confirm(struct pipe_inode_info *info,
                             struct pipe_buffer *buf)
{
        return 0;
}

/**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
 * @pipe:       the pipe that the buffer belongs to
 * @buf:        the buffer to put a reference to
 *
 * Description:
 *      This function releases a reference to @buf.
 */
void generic_pipe_buf_release(struct pipe_inode_info *pipe,
                              struct pipe_buffer *buf)
{
        page_cache_release(buf->page);
}

static const struct pipe_buf_operations anon_pipe_buf_ops = {
        .can_merge = 1,
        .map = generic_pipe_buf_map,
        .unmap = generic_pipe_buf_unmap,
        .confirm = generic_pipe_buf_confirm,
        .release = anon_pipe_buf_release,
        .steal = generic_pipe_buf_steal,
        .get = generic_pipe_buf_get,
};

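/*
 * Editor's note: an illustrative sketch of how this ops table is driven
 * when a reader consumes one buffer; it mirrors the pipe_read() loop
 * below with error handling elided:
 *
 *      struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
 *      const struct pipe_buf_operations *ops = buf->ops;
 *      void *addr;
 *
 *      ops->confirm(pipe, buf);                // ensure the data is valid
 *      addr = ops->map(pipe, buf, 0);          // kernel mapping of the page
 *      // ... copy out from addr + buf->offset ...
 *      ops->unmap(pipe, buf, addr);
 *      if (!buf->len) {
 *              buf->ops = NULL;
 *              ops->release(pipe, buf);        // drop the page reference
 *      }
 */
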
static ssize_t
pipe_read(struct kiocb *iocb, const struct iovec *_iov,
          unsigned long nr_segs, loff_t pos)
{
        struct file *filp = iocb->ki_filp;
        struct inode *inode = filp->f_path.dentry->d_inode;
        struct pipe_inode_info *pipe;
        int do_wakeup;
        ssize_t ret;
        struct iovec *iov = (struct iovec *)_iov;
        size_t total_len;

        total_len = iov_length(iov, nr_segs);
        /* Null read succeeds. */
        if (unlikely(total_len == 0))
                return 0;

        do_wakeup = 0;
        ret = 0;
        mutex_lock(&inode->i_mutex);
        pipe = inode->i_pipe;
        for (;;) {
                int bufs = pipe->nrbufs;
                if (bufs) {
                        int curbuf = pipe->curbuf;
                        struct pipe_buffer *buf = pipe->bufs + curbuf;
                        const struct pipe_buf_operations *ops = buf->ops;
                        void *addr;
                        size_t chars = buf->len;
                        int error, atomic;

                        if (chars > total_len)
                                chars = total_len;

                        error = ops->confirm(pipe, buf);
                        if (error) {
                                if (!ret)
                                        ret = error;
                                break;
                        }

                        atomic = !iov_fault_in_pages_write(iov, chars);
redo:
                        addr = ops->map(pipe, buf, atomic);
                        error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars, atomic);
                        ops->unmap(pipe, buf, addr);
                        if (unlikely(error)) {
                                /*
                                 * Just retry with the slow path if we failed.
                                 */
                                if (atomic) {
                                        atomic = 0;
                                        goto redo;
                                }
                                if (!ret)
                                        ret = error;
                                break;
                        }
                        ret += chars;
                        buf->offset += chars;
                        buf->len -= chars;
                        if (!buf->len) {
                                buf->ops = NULL;
                                ops->release(pipe, buf);
                                curbuf = (curbuf + 1) & (pipe->buffers - 1);
                                pipe->curbuf = curbuf;
                                pipe->nrbufs = --bufs;
                                do_wakeup = 1;
                        }
                        total_len -= chars;
                        if (!total_len)
                                break;  /* common path: read succeeded */
                }
                if (bufs)       /* More to do? */
                        continue;
                if (!pipe->writers)
                        break;
                if (!pipe->waiting_writers) {
                        /* syscall merging: Usually we must not sleep
                         * if O_NONBLOCK is set, or if we got some data.
                         * But if a writer sleeps in kernel space, then
                         * we can wait for that data without violating POSIX.
                         */
                        if (ret)
                                break;
                        if (filp->f_flags & O_NONBLOCK) {
                                ret = -EAGAIN;
                                break;
                        }
                }
                if (signal_pending(current)) {
                        if (!ret)
                                ret = -ERESTARTSYS;
                        break;
                }
                if (do_wakeup) {
                        wake_up_interruptible_sync(&pipe->wait);
                        kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
                }
                pipe_wait(pipe);
        }
        mutex_unlock(&inode->i_mutex);

        /* Signal writers asynchronously that there is more room. */
        if (do_wakeup) {
                wake_up_interruptible_sync(&pipe->wait);
                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
        }
        if (ret > 0)
                file_accessed(filp);
        return ret;
}

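/*
 * Editor's note: the user-visible semantics implemented above, as a
 * userspace sketch (plain C, not part of this file):
 *
 *      char buf[4096];
 *      ssize_t n = read(pipefd[0], buf, sizeof(buf));
 *      // n > 0:  data was available (possibly less than requested)
 *      // n == 0: every writer has closed its end (EOF)
 *      // n < 0 && errno == EAGAIN: O_NONBLOCK is set and the pipe is
 *      //      empty -- unless a writer is asleep mid-write, in which
 *      //      case even a nonblocking reader waits for that data, as
 *      //      the "syscall merging" comment above explains.
 */
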
static ssize_t
pipe_write(struct kiocb *iocb, const struct iovec *_iov,
           unsigned long nr_segs, loff_t ppos)
{
        struct file *filp = iocb->ki_filp;
        struct inode *inode = filp->f_path.dentry->d_inode;
        struct pipe_inode_info *pipe;
        ssize_t ret;
        int do_wakeup;
        struct iovec *iov = (struct iovec *)_iov;
        size_t total_len;
        ssize_t chars;

        total_len = iov_length(iov, nr_segs);
        /* Null write succeeds. */
        if (unlikely(total_len == 0))
                return 0;

        do_wakeup = 0;
        ret = 0;
        mutex_lock(&inode->i_mutex);
        pipe = inode->i_pipe;

        if (!pipe->readers) {
                send_sig(SIGPIPE, current, 0);
                ret = -EPIPE;
                goto out;
        }

        /* We try to merge small writes */
        chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */
        if (pipe->nrbufs && chars != 0) {
                int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) &
                                                        (pipe->buffers - 1);
                struct pipe_buffer *buf = pipe->bufs + lastbuf;
                const struct pipe_buf_operations *ops = buf->ops;
                int offset = buf->offset + buf->len;

                if (ops->can_merge && offset + chars <= PAGE_SIZE) {
                        int error, atomic = 1;
                        void *addr;

                        error = ops->confirm(pipe, buf);
                        if (error)
                                goto out;

                        iov_fault_in_pages_read(iov, chars);
redo1:
                        addr = ops->map(pipe, buf, atomic);
                        error = pipe_iov_copy_from_user(offset + addr, iov,
                                                        chars, atomic);
                        ops->unmap(pipe, buf, addr);
                        ret = error;
                        do_wakeup = 1;
                        if (error) {
                                if (atomic) {
                                        atomic = 0;
                                        goto redo1;
                                }
                                goto out;
                        }
                        buf->len += chars;
                        total_len -= chars;
                        ret = chars;
                        if (!total_len)
                                goto out;
                }
        }

        for (;;) {
                int bufs;

                if (!pipe->readers) {
                        send_sig(SIGPIPE, current, 0);
                        if (!ret)
                                ret = -EPIPE;
                        break;
                }
                bufs = pipe->nrbufs;
                if (bufs < pipe->buffers) {
                        int newbuf = (pipe->curbuf + bufs) & (pipe->buffers-1);
                        struct pipe_buffer *buf = pipe->bufs + newbuf;
                        struct page *page = pipe->tmp_page;
                        char *src;
                        int error, atomic = 1;

                        if (!page) {
                                page = alloc_page(GFP_HIGHUSER);
                                if (unlikely(!page)) {
                                        ret = ret ? : -ENOMEM;
                                        break;
                                }
                                pipe->tmp_page = page;
                        }
                        /* Always wake up, even if the copy fails. Otherwise
                         * we lock up (O_NONBLOCK-)readers that sleep due to
                         * syscall merging.
                         * FIXME! Is this really true?
                         */
                        do_wakeup = 1;
                        chars = PAGE_SIZE;
                        if (chars > total_len)
                                chars = total_len;

                        iov_fault_in_pages_read(iov, chars);
redo2:
                        if (atomic)
                                src = kmap_atomic(page, KM_USER0);
                        else
                                src = kmap(page);

                        error = pipe_iov_copy_from_user(src, iov, chars,
                                                        atomic);
                        if (atomic)
                                kunmap_atomic(src, KM_USER0);
                        else
                                kunmap(page);

                        if (unlikely(error)) {
                                if (atomic) {
                                        atomic = 0;
                                        goto redo2;
                                }
                                if (!ret)
                                        ret = error;
                                break;
                        }
                        ret += chars;

                        /* Insert it into the buffer array */
                        buf->page = page;
                        buf->ops = &anon_pipe_buf_ops;
                        buf->offset = 0;
                        buf->len = chars;
                        pipe->nrbufs = ++bufs;
                        pipe->tmp_page = NULL;

                        total_len -= chars;
                        if (!total_len)
                                break;
                }
                if (bufs < pipe->buffers)
                        continue;
                if (filp->f_flags & O_NONBLOCK) {
                        if (!ret)
                                ret = -EAGAIN;
                        break;
                }
                if (signal_pending(current)) {
                        if (!ret)
                                ret = -ERESTARTSYS;
                        break;
                }
                if (do_wakeup) {
                        wake_up_interruptible_sync(&pipe->wait);
                        kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
                        do_wakeup = 0;
                }
                pipe->waiting_writers++;
                pipe_wait(pipe);
                pipe->waiting_writers--;
        }
out:
        mutex_unlock(&inode->i_mutex);
        if (do_wakeup) {
                wake_up_interruptible_sync(&pipe->wait);
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
        }
        if (ret > 0)
                file_update_time(filp);
        return ret;
}

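/*
 * Editor's note: a short worked example of the merge path above, assuming
 * PAGE_SIZE == 4096. After write(fd, buf1, 100) the pipe holds one buffer
 * with offset 0 and len 100. A following write(fd, buf2, 50) computes
 * chars = 50 & 4095 = 50; the last buffer's ops allow merging and
 * 100 + 50 <= 4096, so the 50 bytes are appended into the same page and
 * buf->len becomes 150 -- no second page is allocated and the reader sees
 * one contiguous buffer.
 */
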
static ssize_t
bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
{
        return -EBADF;
}

static ssize_t
bad_pipe_w(struct file *filp, const char __user *buf, size_t count,
           loff_t *ppos)
{
        return -EBADF;
}

static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
        struct inode *inode = filp->f_path.dentry->d_inode;
        struct pipe_inode_info *pipe;
        int count, buf, nrbufs;

        switch (cmd) {
        case FIONREAD:
                mutex_lock(&inode->i_mutex);
                pipe = inode->i_pipe;
                count = 0;
                buf = pipe->curbuf;
                nrbufs = pipe->nrbufs;
                while (--nrbufs >= 0) {
                        count += pipe->bufs[buf].len;
                        buf = (buf+1) & (pipe->buffers - 1);
                }
                mutex_unlock(&inode->i_mutex);

                return put_user(count, (int __user *)arg);
        default:
                return -EINVAL;
        }
}

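/*
 * Editor's note: FIONREAD is the only ioctl supported here; it reports
 * the number of unread bytes by summing buf->len over the in-use ring
 * slots. From userspace:
 *
 *      int avail;
 *      if (ioctl(pipefd[0], FIONREAD, &avail) == 0)
 *              printf("%d bytes buffered\n", avail);
 */
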
/* No kernel lock held - fine */
static unsigned int
pipe_poll(struct file *filp, poll_table *wait)
{
        unsigned int mask;
        struct inode *inode = filp->f_path.dentry->d_inode;
        struct pipe_inode_info *pipe = inode->i_pipe;
        int nrbufs;

        poll_wait(filp, &pipe->wait, wait);

        /* Reading only -- no need for acquiring the semaphore. */
        nrbufs = pipe->nrbufs;
        mask = 0;
        if (filp->f_mode & FMODE_READ) {
                mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0;
                if (!pipe->writers && filp->f_version != pipe->w_counter)
                        mask |= POLLHUP;
        }

        if (filp->f_mode & FMODE_WRITE) {
                mask |= (nrbufs < pipe->buffers) ? POLLOUT | POLLWRNORM : 0;
                /*
                 * Most Unices do not set POLLERR for FIFOs but on Linux they
                 * behave exactly like pipes for poll().
                 */
                if (!pipe->readers)
                        mask |= POLLERR;
        }

        return mask;
}

static int
pipe_release(struct inode *inode, int decr, int decw)
{
        struct pipe_inode_info *pipe;

        mutex_lock(&inode->i_mutex);
        pipe = inode->i_pipe;
        pipe->readers -= decr;
        pipe->writers -= decw;

        if (!pipe->readers && !pipe->writers) {
                free_pipe_info(inode);
        } else {
                wake_up_interruptible_sync(&pipe->wait);
                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
                kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
        }
        mutex_unlock(&inode->i_mutex);

        return 0;
}

static int
pipe_read_fasync(int fd, struct file *filp, int on)
{
        struct inode *inode = filp->f_path.dentry->d_inode;
        int retval;

        mutex_lock(&inode->i_mutex);
        retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers);
        mutex_unlock(&inode->i_mutex);

        return retval;
}


static int
pipe_write_fasync(int fd, struct file *filp, int on)
{
        struct inode *inode = filp->f_path.dentry->d_inode;
        int retval;

        mutex_lock(&inode->i_mutex);
        retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers);
        mutex_unlock(&inode->i_mutex);

        return retval;
}


static int
pipe_rdwr_fasync(int fd, struct file *filp, int on)
{
        struct inode *inode = filp->f_path.dentry->d_inode;
        struct pipe_inode_info *pipe = inode->i_pipe;
        int retval;

        mutex_lock(&inode->i_mutex);
        retval = fasync_helper(fd, filp, on, &pipe->fasync_readers);
        if (retval >= 0) {
                retval = fasync_helper(fd, filp, on, &pipe->fasync_writers);
                if (retval < 0) /* this can happen only if on == T */
                        fasync_helper(-1, filp, 0, &pipe->fasync_readers);
        }
        mutex_unlock(&inode->i_mutex);
        return retval;
}


static int
pipe_read_release(struct inode *inode, struct file *filp)
{
        return pipe_release(inode, 1, 0);
}

static int
pipe_write_release(struct inode *inode, struct file *filp)
{
        return pipe_release(inode, 0, 1);
}

static int
pipe_rdwr_release(struct inode *inode, struct file *filp)
{
        int decr, decw;

        decr = (filp->f_mode & FMODE_READ) != 0;
        decw = (filp->f_mode & FMODE_WRITE) != 0;
        return pipe_release(inode, decr, decw);
}

static int
pipe_read_open(struct inode *inode, struct file *filp)
{
        int ret = -ENOENT;

        mutex_lock(&inode->i_mutex);

        if (inode->i_pipe) {
                ret = 0;
                inode->i_pipe->readers++;
        }

        mutex_unlock(&inode->i_mutex);

        return ret;
}

static int
pipe_write_open(struct inode *inode, struct file *filp)
{
        int ret = -ENOENT;

        mutex_lock(&inode->i_mutex);

        if (inode->i_pipe) {
                ret = 0;
                inode->i_pipe->writers++;
        }

        mutex_unlock(&inode->i_mutex);

        return ret;
}

static int
pipe_rdwr_open(struct inode *inode, struct file *filp)
{
        int ret = -ENOENT;

        mutex_lock(&inode->i_mutex);

        if (inode->i_pipe) {
                ret = 0;
                if (filp->f_mode & FMODE_READ)
                        inode->i_pipe->readers++;
                if (filp->f_mode & FMODE_WRITE)
                        inode->i_pipe->writers++;
        }

        mutex_unlock(&inode->i_mutex);

        return ret;
}

/*
 * The file_operations structs are not static because they
 * are also used in linux/fs/fifo.c to do operations on FIFOs.
 *
 * Pipes reuse fifos' file_operations structs.
 */
const struct file_operations read_pipefifo_fops = {
        .llseek         = no_llseek,
        .read           = do_sync_read,
        .aio_read       = pipe_read,
        .write          = bad_pipe_w,
        .poll           = pipe_poll,
        .unlocked_ioctl = pipe_ioctl,
        .open           = pipe_read_open,
        .release        = pipe_read_release,
        .fasync         = pipe_read_fasync,
};

const struct file_operations write_pipefifo_fops = {
        .llseek         = no_llseek,
        .read           = bad_pipe_r,
        .write          = do_sync_write,
        .aio_write      = pipe_write,
        .poll           = pipe_poll,
        .unlocked_ioctl = pipe_ioctl,
        .open           = pipe_write_open,
        .release        = pipe_write_release,
        .fasync         = pipe_write_fasync,
};

const struct file_operations rdwr_pipefifo_fops = {
        .llseek         = no_llseek,
        .read           = do_sync_read,
        .aio_read       = pipe_read,
        .write          = do_sync_write,
        .aio_write      = pipe_write,
        .poll           = pipe_poll,
        .unlocked_ioctl = pipe_ioctl,
        .open           = pipe_rdwr_open,
        .release        = pipe_rdwr_release,
        .fasync         = pipe_rdwr_fasync,
};

struct pipe_inode_info * alloc_pipe_info(struct inode *inode)
{
        struct pipe_inode_info *pipe;

        pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
        if (pipe) {
                pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
                if (pipe->bufs) {
                        init_waitqueue_head(&pipe->wait);
                        pipe->r_counter = pipe->w_counter = 1;
                        pipe->inode = inode;
                        pipe->buffers = PIPE_DEF_BUFFERS;
                        return pipe;
                }
                kfree(pipe);
        }

        return NULL;
}

void __free_pipe_info(struct pipe_inode_info *pipe)
{
        int i;

        for (i = 0; i < pipe->buffers; i++) {
                struct pipe_buffer *buf = pipe->bufs + i;
                if (buf->ops)
                        buf->ops->release(pipe, buf);
        }
        if (pipe->tmp_page)
                __free_page(pipe->tmp_page);
        kfree(pipe->bufs);
        kfree(pipe);
}

void free_pipe_info(struct inode *inode)
{
        __free_pipe_info(inode->i_pipe);
        inode->i_pipe = NULL;
}

static struct vfsmount *pipe_mnt __read_mostly;

/*
 * pipefs_dname() is called from d_path().
 */
static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
{
        return dynamic_dname(dentry, buffer, buflen, "pipe:[%lu]",
                                dentry->d_inode->i_ino);
}

static const struct dentry_operations pipefs_dentry_operations = {
        .d_dname        = pipefs_dname,
};

static struct inode * get_pipe_inode(void)
{
        struct inode *inode = new_inode(pipe_mnt->mnt_sb);
        struct pipe_inode_info *pipe;

        if (!inode)
                goto fail_inode;

        pipe = alloc_pipe_info(inode);
        if (!pipe)
                goto fail_iput;
        inode->i_pipe = pipe;

        pipe->readers = pipe->writers = 1;
        inode->i_fop = &rdwr_pipefifo_fops;

        /*
         * Mark the inode dirty from the very beginning,
         * that way it will never be moved to the dirty
         * list because "mark_inode_dirty()" will think
         * that it already _is_ on the dirty list.
         */
        inode->i_state = I_DIRTY;
        inode->i_mode = S_IFIFO | S_IRUSR | S_IWUSR;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
        inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;

        return inode;

fail_iput:
        iput(inode);

fail_inode:
        return NULL;
}

struct file *create_write_pipe(int flags)
{
        int err;
        struct inode *inode;
        struct file *f;
        struct path path;
        struct qstr name = { .name = "" };

        err = -ENFILE;
        inode = get_pipe_inode();
        if (!inode)
                goto err;

        err = -ENOMEM;
        path.dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &name);
        if (!path.dentry)
                goto err_inode;
        path.mnt = mntget(pipe_mnt);

        path.dentry->d_op = &pipefs_dentry_operations;
        d_instantiate(path.dentry, inode);

        err = -ENFILE;
        f = alloc_file(&path, FMODE_WRITE, &write_pipefifo_fops);
        if (!f)
                goto err_dentry;
        f->f_mapping = inode->i_mapping;

        f->f_flags = O_WRONLY | (flags & O_NONBLOCK);
        f->f_version = 0;

        return f;

 err_dentry:
        free_pipe_info(inode);
        path_put(&path);
        return ERR_PTR(err);

 err_inode:
        free_pipe_info(inode);
        iput(inode);
 err:
        return ERR_PTR(err);
}

void free_write_pipe(struct file *f)
{
        free_pipe_info(f->f_dentry->d_inode);
        path_put(&f->f_path);
        put_filp(f);
}

struct file *create_read_pipe(struct file *wrf, int flags)
{
        /* Grab pipe from the writer */
        struct file *f = alloc_file(&wrf->f_path, FMODE_READ,
                                    &read_pipefifo_fops);
        if (!f)
                return ERR_PTR(-ENFILE);

        path_get(&wrf->f_path);
        f->f_flags = O_RDONLY | (flags & O_NONBLOCK);

        return f;
}

int do_pipe_flags(int *fd, int flags)
{
        struct file *fw, *fr;
        int error;
        int fdw, fdr;

        if (flags & ~(O_CLOEXEC | O_NONBLOCK))
                return -EINVAL;

        fw = create_write_pipe(flags);
        if (IS_ERR(fw))
                return PTR_ERR(fw);
        fr = create_read_pipe(fw, flags);
        error = PTR_ERR(fr);
        if (IS_ERR(fr))
                goto err_write_pipe;

        error = get_unused_fd_flags(flags);
        if (error < 0)
                goto err_read_pipe;
        fdr = error;

        error = get_unused_fd_flags(flags);
        if (error < 0)
                goto err_fdr;
        fdw = error;

        audit_fd_pair(fdr, fdw);
        fd_install(fdr, fr);
        fd_install(fdw, fw);
        fd[0] = fdr;
        fd[1] = fdw;

        return 0;

 err_fdr:
        put_unused_fd(fdr);
 err_read_pipe:
        path_put(&fr->f_path);
        put_filp(fr);
 err_write_pipe:
        free_write_pipe(fw);
        return error;
}

/*
 * sys_pipe() is the normal C calling standard for creating
 * a pipe. It's not the way Unix traditionally does this, though.
 */
SYSCALL_DEFINE2(pipe2, int __user *, fildes, int, flags)
{
        int fd[2];
        int error;

        error = do_pipe_flags(fd, flags);
        if (!error) {
                if (copy_to_user(fildes, fd, sizeof(fd))) {
                        sys_close(fd[0]);
                        sys_close(fd[1]);
                        error = -EFAULT;
                }
        }
        return error;
}

SYSCALL_DEFINE1(pipe, int __user *, fildes)
{
        return sys_pipe2(fildes, 0);
}

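/*
 * Editor's note: the userspace view of the two syscalls above (glibc
 * wrappers; error handling abbreviated):
 *
 *      #define _GNU_SOURCE
 *      #include <unistd.h>
 *      #include <fcntl.h>
 *
 *      int fds[2];
 *      pipe(fds);                      // fds[0] reads, fds[1] writes
 *      // or, applying flags to both descriptors at creation:
 *      pipe2(fds, O_CLOEXEC | O_NONBLOCK);
 *
 * Any other flag bit makes do_pipe_flags() return -EINVAL.
 */
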
/*
 * Allocate a new array of pipe buffers and copy the info over. Returns the
 * pipe size if successful, or -ERROR on error.
 */
static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
{
        struct pipe_buffer *bufs;

        /*
         * Must be a power-of-2 currently
         */
        if (!is_power_of_2(arg))
                return -EINVAL;

        /*
         * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
         * expect a lot of shrink+grow operations, just free and allocate
         * again like we would do for growing. If the pipe currently
         * contains more buffers than arg, then return busy.
         */
        if (arg < pipe->nrbufs)
                return -EBUSY;

        bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL);
        if (unlikely(!bufs))
                return -ENOMEM;

        /*
         * The pipe array wraps around, so just start the new one at zero
         * and adjust the indexes.
         */
        if (pipe->nrbufs) {
                /* entries from curbuf up to the end of the old array */
                const unsigned int head = min(pipe->nrbufs,
                                              pipe->buffers - pipe->curbuf);
                /* entries that had wrapped around to the start */
                const unsigned int tail = pipe->nrbufs - head;

                if (head)
                        memcpy(bufs, pipe->bufs + pipe->curbuf,
                               head * sizeof(struct pipe_buffer));
                if (tail)
                        memcpy(bufs + head, pipe->bufs,
                               tail * sizeof(struct pipe_buffer));
        }

        pipe->curbuf = 0;
        kfree(pipe->bufs);
        pipe->bufs = bufs;
        pipe->buffers = arg;
        return arg;
}

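/*
 * Editor's note: a worked example of the copy above. Suppose buffers = 8,
 * curbuf = 6 and nrbufs = 4, so the live entries occupy slots 6, 7, 0, 1.
 * Then head = min(4, 8 - 6) = 2 entries are copied from &pipe->bufs[6],
 * and the tail = 2 wrapped entries are copied from &pipe->bufs[0], so the
 * new array holds the same four buffers linearly starting at slot 0.
 */
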
long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
{
        struct pipe_inode_info *pipe;
        long ret;

        pipe = file->f_path.dentry->d_inode->i_pipe;
        if (!pipe)
                return -EBADF;

        mutex_lock(&pipe->inode->i_mutex);

        switch (cmd) {
        case F_SETPIPE_SZ:
                ret = pipe_set_size(pipe, arg);
                break;
        case F_GETPIPE_SZ:
                ret = pipe->buffers;
                break;
        default:
                ret = -EINVAL;
                break;
        }

        mutex_unlock(&pipe->inode->i_mutex);
        return ret;
}

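/*
 * Editor's note: the fcntl interface above from userspace, as a hedged
 * sketch. At this point in history the argument counts pipe buffers
 * (pages) and must be a power of two; later kernels switched to a byte
 * count:
 *
 *      #define _GNU_SOURCE
 *      #include <fcntl.h>
 *
 *      long n = fcntl(pipefd[1], F_GETPIPE_SZ, 0);     // current slot count
 *      if (fcntl(pipefd[1], F_SETPIPE_SZ, 64) < 0)     // grow to 64 slots
 *              perror("F_SETPIPE_SZ");
 *
 * pipe_set_size() fails with -EINVAL for a non-power-of-two size and
 * -EBUSY when the pipe already holds more buffers than requested.
 */
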
/*
 * pipefs should _never_ be mounted by userland - too much of security hassle,
 * no real gain from having the whole whorehouse mounted. So we don't need
 * any operations on the root directory. However, we need a non-trivial
 * d_name - pipe: will go nicely and kill the special-casing in procfs.
 */
static int pipefs_get_sb(struct file_system_type *fs_type,
                         int flags, const char *dev_name, void *data,
                         struct vfsmount *mnt)
{
        return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC, mnt);
}

static struct file_system_type pipe_fs_type = {
        .name           = "pipefs",
        .get_sb         = pipefs_get_sb,
        .kill_sb        = kill_anon_super,
};

static int __init init_pipe_fs(void)
{
        int err = register_filesystem(&pipe_fs_type);

        if (!err) {
                pipe_mnt = kern_mount(&pipe_fs_type);
                if (IS_ERR(pipe_mnt)) {
                        err = PTR_ERR(pipe_mnt);
                        unregister_filesystem(&pipe_fs_type);
                }
        }
        return err;
}

static void __exit exit_pipe_fs(void)
{
        unregister_filesystem(&pipe_fs_type);
        mntput(pipe_mnt);
}

fs_initcall(init_pipe_fs);
module_exit(exit_pipe_fs);