[net-next-2.6.git] / fs / proc / generic.c

/*
 * fs/proc/generic.c --- generic routines for the proc-fs
 *
 * This file contains generic proc-fs routines for handling
 * directories and files.
 * 
 * Copyright (C) 1991, 1992 Linus Torvalds.
 * Copyright (C) 1997 Theodore Ts'o
 */

#include <linux/errno.h>
#include <linux/time.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/module.h>
#include <linux/mount.h>
#include <linux/smp_lock.h>
#include <linux/init.h>
#include <linux/idr.h>
#include <linux/namei.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <asm/uaccess.h>

#include "internal.h"

/*
 * Forward declarations of the legacy proc file handlers, needed because
 * proc_file_operations below refers to them before they are defined.
 */
static ssize_t proc_file_read(struct file *file, char __user *buf,
			      size_t nbytes, loff_t *ppos);
static ssize_t proc_file_write(struct file *file, const char __user *buffer,
			       size_t count, loff_t *ppos);
static loff_t proc_file_lseek(struct file *, loff_t, int);

/*
 * Taken in this file around every walk or modification of a directory's
 * ->subdir / ->next list of proc_dir_entry children.
 */
DEFINE_SPINLOCK(proc_subdir_lock);

/*
 * Compare the name stored in @de against the @len bytes starting at @name.
 *
 * Returns 1 when the lengths are equal and the bytes are identical,
 * 0 otherwise.
 */
int proc_match(int len, const char *name, struct proc_dir_entry *de)
{
	if (de->namelen == len && memcmp(name, de->name, len) == 0)
		return 1;
	return 0;
}

/*
 * Default file operations for regular proc files; proc_register() installs
 * these when an entry was registered without its own proc_fops.
 */
static struct file_operations proc_file_operations = {
	.llseek		= proc_file_lseek,
	.read		= proc_file_read,
	.write		= proc_file_write,
};

/*
 * Largest chunk proc_file_read() requests from a read callback per call.
 * The buffer is a full page, but our output routines use some slack for
 * overruns, so 1024 bytes are held back.
 */
#define PROC_BLOCK_SIZE	(PAGE_SIZE - 1024)

/*
 * read() handler for legacy proc files.
 *
 * Allocates one scratch page and repeatedly calls the entry's get_info or
 * read_proc callback to fill it, copying each chunk to the user buffer
 * and advancing *ppos, until the callback signals end of file, returns
 * no data, or nbytes have been delivered.
 *
 * Returns the number of bytes copied to @buf.  If nothing was copied
 * before a failure, returns the callback's negative value, -EFAULT when
 * the copy to user space delivered nothing, or -ENOMEM when the scratch
 * page could not be allocated.  Returns 0 when *ppos is beyond
 * MAX_NON_LFS or the entry has neither callback.
 */
static ssize_t
proc_file_read(struct file *file, char __user *buf, size_t nbytes,
	       loff_t *ppos)
{
	struct inode * inode = file->f_dentry->d_inode;
	char 	*page;
	ssize_t	retval=0;
	int	eof=0;
	ssize_t	n, count;
	char	*start;
	struct proc_dir_entry * dp;
	unsigned long long pos;

	/*
	 * Gaah, please just use "seq_file" instead. The legacy /proc
	 * interfaces cut loff_t down to off_t for reads, and ignore
	 * the offset entirely for writes..
	 */
	pos = *ppos;
	if (pos > MAX_NON_LFS)
		return 0;
	/* Never let the read run past MAX_NON_LFS. */
	if (nbytes > MAX_NON_LFS - pos)
		nbytes = MAX_NON_LFS - pos;

	dp = PDE(inode);
	if (!(page = (char*) __get_free_page(GFP_KERNEL)))
		return -ENOMEM;

	while ((nbytes > 0) && !eof) {
		/* Ask for at most PROC_BLOCK_SIZE bytes per callback call. */
		count = min_t(size_t, PROC_BLOCK_SIZE, nbytes);

		start = NULL;
		if (dp->get_info) {
			/* Handle old net routines */
			n = dp->get_info(page, &start, *ppos, count);
			/* get_info has no eof flag: a short result ends the read. */
			if (n < count)
				eof = 1;
		} else if (dp->read_proc) {
			/*
			 * How to be a proc read function
			 * ------------------------------
			 * Prototype:
			 *    int f(char *buffer, char **start, off_t offset,
			 *          int count, int *peof, void *dat)
			 *
			 * Assume that the buffer is "count" bytes in size.
			 *
			 * If you know you have supplied all the data you
			 * have, set *peof.
			 *
			 * You have three ways to return data:
			 * 0) Leave *start = NULL.  (This is the default.)
			 *    Put the data of the requested offset at that
			 *    offset within the buffer.  Return the number (n)
			 *    of bytes there are from the beginning of the
			 *    buffer up to the last byte of data.  If the
			 *    number of supplied bytes (= n - offset) is
			 *    greater than zero and you didn't signal eof
			 *    and the reader is prepared to take more data
			 *    you will be called again with the requested
			 *    offset advanced by the number of bytes
			 *    absorbed.  This interface is useful for files
			 *    no larger than the buffer.
			 * 1) Set *start = an unsigned long value less than
			 *    the buffer address but greater than zero.
			 *    Put the data of the requested offset at the
			 *    beginning of the buffer.  Return the number of
			 *    bytes of data placed there.  If this number is
			 *    greater than zero and you didn't signal eof
			 *    and the reader is prepared to take more data
			 *    you will be called again with the requested
			 *    offset advanced by *start.  This interface is
			 *    useful when you have a large file consisting
			 *    of a series of blocks which you want to count
			 *    and return as wholes.
			 *    (Hack by Paul.Russell@rustcorp.com.au)
			 * 2) Set *start = an address within the buffer.
			 *    Put the data of the requested offset at *start.
			 *    Return the number of bytes of data placed there.
			 *    If this number is greater than zero and you
			 *    didn't signal eof and the reader is prepared to
			 *    take more data you will be called again with the
			 *    requested offset advanced by the number of bytes
			 *    absorbed.
			 */
			n = dp->read_proc(page, &start, *ppos,
					  count, &eof, dp->data);
		} else
			break;

		if (n == 0)   /* end of file */
			break;
		if (n < 0) {  /* error */
			/* Report the error only if no data was delivered yet. */
			if (retval == 0)
				retval = n;
			break;
		}

		if (start == NULL) {
			/* Method 0: n counts from the start of the page. */
			if (n > PAGE_SIZE) {
				printk(KERN_ERR
				       "proc_file_read: Apparent buffer overflow!\n");
				n = PAGE_SIZE;
			}
			/* Keep only the bytes at or beyond the requested offset. */
			n -= *ppos;
			if (n <= 0)
				break;
			if (n > count)
				n = count;
			start = page + *ppos;
		} else if (start < page) {
			/* Method 1: data is at the start of the page, *start is a count. */
			if (n > PAGE_SIZE) {
				printk(KERN_ERR
				       "proc_file_read: Apparent buffer overflow!\n");
				n = PAGE_SIZE;
			}
			if (n > count) {
				/*
				 * Don't reduce n because doing so might
				 * cut off part of a data block.
				 */
				printk(KERN_WARNING
				       "proc_file_read: Read count exceeded\n");
			}
		} else /* start >= page */ {
			/* Method 2: data begins at *start inside the page. */
			unsigned long startoff = (unsigned long)(start - page);
			if (n > (PAGE_SIZE - startoff)) {
				printk(KERN_ERR
				       "proc_file_read: Apparent buffer overflow!\n");
				n = PAGE_SIZE - startoff;
			}
			if (n > count)
				n = count;
		}
		
		/*
		 * copy_to_user() returns the number of bytes it could not
		 * copy, so afterwards n is the number actually delivered.
		 */
 		n -= copy_to_user(buf, start < page ? page : start, n);
		if (n == 0) {
			if (retval == 0)
				retval = -EFAULT;
			break;
		}

		/* Method 1 advances the offset by *start, the others by n. */
		*ppos += start < page ? (unsigned long)start : n;
		nbytes -= n;
		buf += n;
		retval += n;
	}
	free_page((unsigned long) page);
	return retval;
}

/*
 * write() handler for legacy proc files.
 *
 * Passes the user buffer and byte count to the entry's write_proc
 * callback, together with the entry's private data, and returns whatever
 * the callback returns.  Entries without a write_proc callback fail with
 * -EIO.  The file position (@ppos) is not forwarded to the callback.
 */
static ssize_t
proc_file_write(struct file *file, const char __user *buffer,
		size_t count, loff_t *ppos)
{
	struct proc_dir_entry *entry = PDE(file->f_dentry->d_inode);

	/* FIXME: does this routine need ppos?  probably... */
	if (entry->write_proc)
		return entry->write_proc(file, buffer, count, entry->data);

	return -EIO;
}


/*
 * llseek() handler for legacy proc files.
 *
 * Only origin 0 (absolute) and origin 1 (relative to the current position)
 * are accepted.  The resulting offset must lie in [0, MAX_NON_LFS]; on
 * success it is stored in file->f_pos and returned.  Anything else
 * returns -EINVAL and leaves f_pos untouched.
 */
static loff_t
proc_file_lseek(struct file *file, loff_t offset, int orig)
{
	if (orig != 0 && orig != 1)
		return -EINVAL;

	if (orig == 1)
		offset += file->f_pos;

	if (offset < 0 || offset > MAX_NON_LFS)
		return -EINVAL;

	file->f_pos = offset;
	return offset;
}

/*
 * setattr handler for proc inodes.
 *
 * Validates the requested change with inode_change_ok(), applies it with
 * inode_setattr(), and on success mirrors the inode's uid, gid and mode
 * into the backing proc_dir_entry.  Returns 0 on success or the error
 * from either helper.
 */
static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
{
	struct inode *inode = dentry->d_inode;
	struct proc_dir_entry *pde = PDE(inode);
	int err;

	err = inode_change_ok(inode, iattr);
	if (!err)
		err = inode_setattr(inode, iattr);
	if (err)
		return err;

	pde->uid = inode->i_uid;
	pde->gid = inode->i_gid;
	pde->mode = inode->i_mode;
	return 0;
}

static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
			struct kstat *stat)
{
	struct inode *inode = dentry->d_inode;
	struct proc_dir_entry *de = PROC_I(inode)->pde;
	if (de && de->nlink)
		inode->i_nlink = de->nlink;

	generic_fillattr(inode, stat);
	return 0;
}

/*
 * Default inode operations for regular proc files; proc_register()
 * installs these when an entry was registered without its own proc_iops.
 */
static struct inode_operations proc_file_inode_operations = {
	.setattr	= proc_notify_change,
};

/*
 * Resolve the directory part of a '/'-separated proc path.
 *
 * Starting at proc_root, each component that is followed by a '/' is
 * looked up among the children of the current directory.  For a name
 * such as "tty/driver/serial" this stores the entry for
 * "/proc/tty/driver" in *ret and points *residual at "serial".
 *
 * Returns 0 on success.  If a directory component is not found, returns
 * -ENOENT and leaves *ret and *residual unmodified.  The walk is done
 * under proc_subdir_lock.
 */
static int xlate_proc_name(const char *name,
			   struct proc_dir_entry **ret, const char **residual)
{
	struct proc_dir_entry *dir = &proc_root;
	const char *component = name;
	const char *slash;
	int status = 0;

	spin_lock(&proc_subdir_lock);
	for (slash = strchr(component, '/'); slash;
	     slash = strchr(component, '/')) {
		int len = slash - component;
		struct proc_dir_entry *child = dir->subdir;

		/* Find the child whose name is this path component. */
		while (child && !proc_match(len, component, child))
			child = child->next;
		if (!child) {
			status = -ENOENT;
			goto unlock;
		}

		dir = child;
		component = slash + 1;
	}
	*residual = component;
	*ret = dir;
unlock:
	spin_unlock(&proc_subdir_lock);
	return status;
}

/* Allocator for the ids behind dynamically assigned proc inode numbers. */
static DEFINE_IDR(proc_inum_idr);
static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */

/*
 * Base added to an allocated id to form a dynamic inode number;
 * free_proc_entry() leaves entries with a lower inode number alone.
 */
#define PROC_DYNAMIC_FIRST 0xF0000000UL

/*
 * Allocate a dynamic proc inode number.
 *
 * An id is taken from proc_inum_idr (under proc_inum_lock) and offset by
 * PROC_DYNAMIC_FIRST.  The allocation is retried for as long as
 * idr_get_new() reports -EAGAIN.
 *
 * Returns an inode number between PROC_DYNAMIC_FIRST and 0xffffffff, or
 * zero on failure: out of memory, an idr error, or an id so large that
 * the resulting inode number would not fit in 32 bits.  In the last case
 * the id is handed back to the idr before returning.
 */
static unsigned int get_inode_number(void)
{
	unsigned long offset;
	int i;
	int error;

retry:
	if (idr_pre_get(&proc_inum_idr, GFP_KERNEL) == 0)
		return 0;

	spin_lock(&proc_inum_lock);
	error = idr_get_new(&proc_inum_idr, NULL, &i);
	spin_unlock(&proc_inum_lock);
	if (error == -EAGAIN)
		goto retry;
	else if (error)
		return 0;

	offset = i & MAX_ID_MASK;

	/*
	 * Do the arithmetic in unsigned long and refuse ids that would push
	 * the inode number past 0xffffffff, rather than trusting that the
	 * idr never hands out an id that large.
	 */
	if (offset > 0xFFFFFFFFUL - PROC_DYNAMIC_FIRST) {
		spin_lock(&proc_inum_lock);
		idr_remove(&proc_inum_idr, i);
		spin_unlock(&proc_inum_lock);
		return 0;
	}

	return PROC_DYNAMIC_FIRST + offset;
}

/*
 * Give a dynamic inode number back to proc_inum_idr.
 *
 * The id is recovered by subtracting PROC_DYNAMIC_FIRST from @inum; all
 * bits outside MAX_ID_MASK are then set before it is passed to
 * idr_remove() under proc_inum_lock.
 * NOTE(review): presumably idr_remove() ignores the bits outside
 * MAX_ID_MASK, making the OR harmless -- confirm against the idr code.
 */
static void release_inode_number(unsigned int inum)
{
	int id = (inum - PROC_DYNAMIC_FIRST) | ~MAX_ID_MASK;

	spin_lock(&proc_inum_lock);
	idr_remove(&proc_inum_idr, id);
	spin_unlock(&proc_inum_lock);
}

static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd)
{
	nd_set_link(nd, PDE(dentry->d_inode)->data);
	return NULL;
}

/*
 * Default inode operations for proc symlinks; proc_register() installs
 * these when a symlink entry was registered without its own proc_iops.
 */
static struct inode_operations proc_link_inode_operations = {
	.readlink	= generic_readlink,
	.follow_link	= proc_follow_link,
};

/*
 * d_delete hook for proc dentries.
 *
 * Entries in /proc can be volatile, so unused dentries should not be
 * kept around: this unconditionally returns 1 for every dentry.  A
 * smarter scheme could keep a "volatile" flag in the inode and only
 * drop the dentries that need it.
 */
static int proc_delete_dentry(struct dentry * dentry)
{
	return 1;
}

/*
 * Dentry operations that proc_lookup() attaches to the dentries it
 * instantiates; proc_kill_inodes() also uses the address of this table
 * to recognise such dentries.
 */
static struct dentry_operations proc_dentry_operations =
{
	.d_delete	= proc_delete_dentry,
};

/*
 * lookup handler for generic proc directories.
 *
 * Scans the children of @dir's proc_dir_entry for one whose name equals
 * the dentry's name.  On a match an inode is obtained through
 * proc_get_inode() and bound to @dentry, and NULL is returned.
 *
 * Negative dentries are never created here; failure is reported by hand:
 * ERR_PTR(-ENOENT) when no child matches (or @dir has no proc entry),
 * ERR_PTR(-EINVAL) when a child matched but no inode could be obtained.
 *
 * proc_subdir_lock is dropped around the proc_get_inode() call.
 */
struct dentry *proc_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
{
	struct proc_dir_entry *pde;
	struct inode *found = NULL;
	int err = -ENOENT;

	lock_kernel();
	spin_lock(&proc_subdir_lock);
	pde = PDE(dir);
	if (pde)
		pde = pde->subdir;
	while (pde) {
		if (pde->namelen == dentry->d_name.len &&
		    !memcmp(dentry->d_name.name, pde->name, pde->namelen)) {
			unsigned int ino = pde->low_ino;

			spin_unlock(&proc_subdir_lock);
			err = -EINVAL;
			found = proc_get_inode(dir->i_sb, ino, pde);
			spin_lock(&proc_subdir_lock);
			break;
		}
		pde = pde->next;
	}
	spin_unlock(&proc_subdir_lock);
	unlock_kernel();

	if (!found)
		return ERR_PTR(err);

	dentry->d_op = &proc_dentry_operations;
	d_add(dentry, found);
	return NULL;
}

/*
 * readdir handler for generic proc directories.
 *
 * filp->f_pos selects where the listing resumes: position 0 is ".",
 * position 1 is "..", and position N >= 2 is the (N - 2)th entry on the
 * directory's ->subdir list.
 *
 * This returns non-zero if at EOF, so that the /proc
 * root directory can use this and check if it should
 * continue with the <pid> entries..
 *
 * Concretely: 1 once the end of the child list has been reached, 0 when
 * filldir() refused an entry, and -EINVAL when the inode has no
 * proc_dir_entry.
 *
 * Note that the VFS-layer doesn't care about the return
 * value of the readdir() call, as long as it's non-negative
 * for success..
 */
int proc_readdir(struct file * filp,
	void * dirent, filldir_t filldir)
{
	struct proc_dir_entry * de;
	unsigned int ino;
	int i;
	struct inode *inode = filp->f_dentry->d_inode;
	int ret = 0;

	lock_kernel();

	ino = inode->i_ino;
	de = PDE(inode);
	if (!de) {
		ret = -EINVAL;
		goto out;
	}
	i = filp->f_pos;
	switch (i) {
		case 0:
			if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
				goto out;
			i++;
			filp->f_pos++;
			/* fall through */
		case 1:
			if (filldir(dirent, "..", 2, i,
				    parent_ino(filp->f_dentry),
				    DT_DIR) < 0)
				goto out;
			i++;
			filp->f_pos++;
			/* fall through */
		default:
			spin_lock(&proc_subdir_lock);
			de = de->subdir;
			/* Skip the children that earlier calls already returned. */
			i -= 2;
			for (;;) {
				if (!de) {
					/* Position is at or past the end of the list. */
					ret = 1;
					spin_unlock(&proc_subdir_lock);
					goto out;
				}
				if (!i)
					break;
				de = de->next;
				i--;
			}

			do {
				/* filldir passes info to user space */
				spin_unlock(&proc_subdir_lock);
				/*
				 * NOTE(review): "mode >> 12" presumably turns the
				 * file-type bits of the mode into a DT_* value --
				 * confirm against the DT_ definitions.
				 */
				if (filldir(dirent, de->name, de->namelen, filp->f_pos,
					    de->low_ino, de->mode >> 12) < 0)
					goto out;
				spin_lock(&proc_subdir_lock);
				filp->f_pos++;
				de = de->next;
			} while (de);
			spin_unlock(&proc_subdir_lock);
	}
	ret = 1;
out:	unlock_kernel();
	return ret;	
}

/*
 * These are the generic /proc directory operations. They
 * use the in-memory "struct proc_dir_entry" tree to parse
 * the /proc directory.  proc_register() installs them on directory
 * entries that come without their own proc_iops.
 */
static struct file_operations proc_dir_operations = {
	.read			= generic_read_dir,
	.readdir		= proc_readdir,
};

/*
 * proc directories can do almost nothing: only name lookup and
 * getting/setting attributes are provided.
 */
static struct inode_operations proc_dir_inode_operations = {
	.lookup		= proc_lookup,
	.getattr	= proc_getattr,
	.setattr	= proc_notify_change,
};

/*
 * Attach @dp to the proc tree as a child of @dir.
 *
 * Assigns a dynamic inode number, fills in default file/inode operations
 * according to the entry's file type where the caller supplied none, and
 * then links the entry at the head of @dir's child list.  Registering a
 * directory also bumps @dir's link count.
 *
 * The defaults are installed BEFORE the entry is linked: once it is on
 * the list a concurrent proc_lookup()/proc_readdir() can find it, so it
 * must already be fully set up.  The link count is adjusted under
 * proc_subdir_lock, matching the decrement in remove_proc_entry().
 *
 * Returns 0 on success, or -EAGAIN if no inode number could be
 * allocated (in which case @dp is not linked).
 */
static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
{
	unsigned int i;

	i = get_inode_number();
	if (i == 0)
		return -EAGAIN;
	dp->low_ino = i;

	if (S_ISDIR(dp->mode)) {
		if (dp->proc_iops == NULL) {
			dp->proc_fops = &proc_dir_operations;
			dp->proc_iops = &proc_dir_inode_operations;
		}
	} else if (S_ISLNK(dp->mode)) {
		if (dp->proc_iops == NULL)
			dp->proc_iops = &proc_link_inode_operations;
	} else if (S_ISREG(dp->mode)) {
		if (dp->proc_fops == NULL)
			dp->proc_fops = &proc_file_operations;
		if (dp->proc_iops == NULL)
			dp->proc_iops = &proc_file_inode_operations;
	}

	/* Publish the fully initialised entry. */
	spin_lock(&proc_subdir_lock);
	dp->next = dir->subdir;
	dp->parent = dir;
	dir->subdir = dp;
	if (S_ISDIR(dp->mode))
		dir->nlink++;
	spin_unlock(&proc_subdir_lock);

	return 0;
}

/*
 * Kill an inode that got unregistered..
 *
 * Walks every open file on the proc superblock (under the file list
 * lock) and, for each file whose dentry was set up by proc_lookup() and
 * whose inode belongs to @de, clears filp->f_op and drops the reference
 * on the old operations.  Actually it's a partial revoke().
 */
static void proc_kill_inodes(struct proc_dir_entry *de)
{
	struct super_block *sb = proc_mnt->mnt_sb;
	struct list_head *pos;

	file_list_lock();
	list_for_each(pos, &sb->s_files) {
		struct file *filp = list_entry(pos, struct file, f_u.fu_list);
		struct dentry *dentry = filp->f_dentry;
		const struct file_operations *old_fops;

		if (dentry->d_op == &proc_dentry_operations &&
		    PDE(dentry->d_inode) == de) {
			old_fops = filp->f_op;
			filp->f_op = NULL;
			fops_put(old_fops);
		}
	}
	file_list_unlock();
}

/*
 * Allocate and initialise a proc_dir_entry; the entry is NOT linked into
 * the tree (callers follow up with proc_register()).
 *
 * @parent: in/out.  When *parent is NULL, @name is taken as a path
 *          relative to the proc root: xlate_proc_name() resolves its
 *          directory part and stores that directory in *parent.  When
 *          *parent is non-NULL, @name must be a single component.
 * @name:   entry name (or path, see above).
 * @mode:   stored in the entry as-is.
 * @nlink:  stored in the entry as-is.
 *
 * The final name component is copied into the same allocation, directly
 * behind the structure, and ent->name points at that copy.
 *
 * Returns the new entry, or NULL when @name is NULL or empty, the
 * directory part cannot be resolved, the final component is empty
 * (e.g. a trailing '/') or still contains a '/', or the allocation
 * fails.
 */
static struct proc_dir_entry *proc_create(struct proc_dir_entry **parent,
					  const char *name,
					  mode_t mode,
					  nlink_t nlink)
{
	struct proc_dir_entry *ent = NULL;
	const char *fn = name;
	int len;

	/* make sure name is valid */
	if (!name || !strlen(name)) goto out;

	if (!(*parent) && xlate_proc_name(name, parent, &fn) != 0)
		goto out;

	/* A path ending in '/' leaves nothing to name the entry with. */
	if (!*fn)
		goto out;

	/* At this point there must not be any '/' characters beyond *fn */
	if (strchr(fn, '/'))
		goto out;

	len = strlen(fn);

	ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
	if (!ent) goto out;

	/* Zero the structure only; the name (with its NUL) goes right after it. */
	memset(ent, 0, sizeof(struct proc_dir_entry));
	memcpy(((char *) ent) + sizeof(struct proc_dir_entry), fn, len + 1);
	ent->name = ((char *) ent) + sizeof(*ent);
	ent->namelen = len;
	ent->mode = mode;
	ent->nlink = nlink;
 out:
	return ent;
}

/*
 * Create a symbolic link named @name under @parent that points at @dest.
 *
 * A private copy of @dest is stored in the entry's ->data and its length
 * in ->size.  Returns the registered entry, or NULL if the entry could
 * not be created, the copy could not be allocated, or registration
 * failed; nothing is leaked in those cases.
 */
struct proc_dir_entry *proc_symlink(const char *name,
		struct proc_dir_entry *parent, const char *dest)
{
	struct proc_dir_entry *ent;

	ent = proc_create(&parent, name,
			  (S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO), 1);
	if (!ent)
		return NULL;

	ent->size = strlen(dest);
	ent->data = kmalloc(ent->size + 1, GFP_KERNEL);
	if (!ent->data)
		goto free_entry;

	strcpy((char *)ent->data, dest);
	if (proc_register(parent, ent) < 0)
		goto free_target;

	return ent;

free_target:
	kfree(ent->data);
free_entry:
	kfree(ent);
	return NULL;
}

/*
 * Create a proc directory named @name under @parent with permission
 * bits @mode (S_IFDIR is added here) and an initial link count of 2.
 *
 * Returns the registered entry, or NULL if it could not be created or
 * registered.
 */
struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode,
		struct proc_dir_entry *parent)
{
	struct proc_dir_entry *dir;

	dir = proc_create(&parent, name, S_IFDIR | mode, 2);
	if (!dir)
		return NULL;

	dir->proc_fops = &proc_dir_operations;
	dir->proc_iops = &proc_dir_inode_operations;

	if (proc_register(parent, dir) < 0) {
		kfree(dir);
		return NULL;
	}
	return dir;
}

struct proc_dir_entry *proc_mkdir(const char *name,
		struct proc_dir_entry *parent)
{
	return proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent);
}

/*
 * Create and register a proc entry of the type given by @mode.
 *
 * Missing pieces of @mode are defaulted first:
 *  - a directory without permission bits becomes r-xr-xr-x, link count 2;
 *  - anything else gets link count 1, S_IFREG if no file type was given,
 *    and r--r--r-- if no permission bits were given.
 * Directories get the generic directory operations up front.
 *
 * Returns the registered entry, or NULL if it could not be created or
 * registered.
 */
struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode,
					 struct proc_dir_entry *parent)
{
	struct proc_dir_entry *ent;
	const int is_dir = S_ISDIR(mode);
	nlink_t nlink = 1;

	if (is_dir) {
		nlink = 2;
		if (!(mode & S_IALLUGO))
			mode |= S_IRUGO | S_IXUGO;
	} else {
		if (!(mode & S_IFMT))
			mode |= S_IFREG;
		if (!(mode & S_IALLUGO))
			mode |= S_IRUGO;
	}

	ent = proc_create(&parent, name, mode, nlink);
	if (!ent)
		return NULL;

	if (is_dir) {
		ent->proc_fops = &proc_dir_operations;
		ent->proc_iops = &proc_dir_inode_operations;
	}

	if (proc_register(parent, ent) < 0) {
		kfree(ent);
		return NULL;
	}
	return ent;
}

/*
 * Release a proc_dir_entry and its dynamic inode number.
 *
 * Entries whose inode number lies below PROC_DYNAMIC_FIRST are left
 * untouched.  For symlinks the target string held in ->data is freed
 * along with the entry.
 */
void free_proc_entry(struct proc_dir_entry *de)
{
	unsigned int ino = de->low_ino;

	if (ino >= PROC_DYNAMIC_FIRST) {
		release_inode_number(ino);

		if (S_ISLNK(de->mode) && de->data)
			kfree(de->data);
		kfree(de);
	}
}

/*
 * Remove a /proc entry and free it if it's not currently in use.
 * If it is in use, we set the 'deleted' flag.
 *
 * With a NULL @parent, @name is a path from the proc root and its
 * directory part is resolved first; nothing happens if that fails.
 * The first child of the parent whose name matches is unlinked under
 * proc_subdir_lock, open files on it are revoked via proc_kill_inodes(),
 * and its link count is cleared.  Removing a directory also drops the
 * parent's link count.  A name that matches no child is silently
 * ignored.
 */
void remove_proc_entry(const char *name, struct proc_dir_entry *parent)
{
	struct proc_dir_entry **link;
	struct proc_dir_entry *victim;
	const char *leaf = name;
	int leaf_len;

	if (!parent && xlate_proc_name(name, &parent, &leaf) != 0)
		return;
	leaf_len = strlen(leaf);

	spin_lock(&proc_subdir_lock);

	/* Find the link (parent->subdir or some ->next) that points at it. */
	link = &parent->subdir;
	while (*link && !proc_match(leaf_len, leaf, *link))
		link = &(*link)->next;

	victim = *link;
	if (victim) {
		*link = victim->next;
		victim->next = NULL;
		if (S_ISDIR(victim->mode))
			parent->nlink--;
		proc_kill_inodes(victim);
		victim->nlink = 0;
		WARN_ON(victim->subdir);
		if (atomic_read(&victim->count)) {
			victim->deleted = 1;
			printk("remove_proc_entry: %s/%s busy, count=%d\n",
				parent->name, victim->name, atomic_read(&victim->count));
		} else {
			free_proc_entry(victim);
		}
	}

	spin_unlock(&proc_subdir_lock);
}
Commit	Line	Data
1da177e4 LT	1	/*
	2	* proc/fs/generic.c --- generic routines for the proc-fs
	3	*
	4	* This file contains generic proc-fs routines for handling
	5	* directories and files.
	6	*
	7	* Copyright (C) 1991, 1992 Linus Torvalds.
	8	* Copyright (C) 1997 Theodore Ts'o
	9	*/
	10
	11	#include <linux/errno.h>
	12	#include <linux/time.h>
	13	#include <linux/proc_fs.h>
	14	#include <linux/stat.h>
	15	#include <linux/module.h>
	16	#include <linux/mount.h>
	17	#include <linux/smp_lock.h>
	18	#include <linux/init.h>
	19	#include <linux/idr.h>
	20	#include <linux/namei.h>
	21	#include <linux/bitops.h>
64a07bd8	22	#include <linux/spinlock.h>
1da177e4 LT	23	#include <asm/uaccess.h>
1da177e4 LT	24
fee781e6 AB	25	#include "internal.h"
fee781e6 AB	26
1da177e4 LT	27	static ssize_t proc_file_read(struct file file, char __user buf,
	28	size_t nbytes, loff_t *ppos);
	29	static ssize_t proc_file_write(struct file file, const char __user buffer,
	30	size_t count, loff_t *ppos);
	31	static loff_t proc_file_lseek(struct file *, loff_t, int);
	32
64a07bd8 SR	33	DEFINE_SPINLOCK(proc_subdir_lock);
64a07bd8 SR	34
1da177e4 LT	35	int proc_match(int len, const char name, struct proc_dir_entry de)
	36	{
	37	if (de->namelen != len)
	38	return 0;
	39	return !memcmp(name, de->name, len);
	40	}
	41
	42	static struct file_operations proc_file_operations = {
	43	.llseek = proc_file_lseek,
	44	.read = proc_file_read,
	45	.write = proc_file_write,
	46	};
	47
	48	/* buffer size is one page but our output routines use some slack for overruns */
	49	#define PROC_BLOCK_SIZE (PAGE_SIZE - 1024)
	50
	51	static ssize_t
	52	proc_file_read(struct file file, char __user buf, size_t nbytes,
	53	loff_t *ppos)
	54	{
	55	struct inode * inode = file->f_dentry->d_inode;
	56	char *page;
	57	ssize_t retval=0;
	58	int eof=0;
	59	ssize_t n, count;
	60	char *start;
	61	struct proc_dir_entry * dp;
8b90db0d LT	62	unsigned long long pos;
	63
	64	/*
	65	* Gaah, please just use "seq_file" instead. The legacy /proc
	66	* interfaces cut loff_t down to off_t for reads, and ignore
	67	* the offset entirely for writes..
	68	*/
	69	pos = *ppos;
	70	if (pos > MAX_NON_LFS)
	71	return 0;
	72	if (nbytes > MAX_NON_LFS - pos)
	73	nbytes = MAX_NON_LFS - pos;
1da177e4 LT	74
	75	dp = PDE(inode);
	76	if (!(page = (char*) __get_free_page(GFP_KERNEL)))
	77	return -ENOMEM;
	78
	79	while ((nbytes > 0) && !eof) {
	80	count = min_t(size_t, PROC_BLOCK_SIZE, nbytes);
	81
	82	start = NULL;
	83	if (dp->get_info) {
	84	/* Handle old net routines */
	85	n = dp->get_info(page, &start, *ppos, count);
	86	if (n < count)
	87	eof = 1;
	88	} else if (dp->read_proc) {
	89	/*
	90	* How to be a proc read function
	91	* ------------------------------
	92	* Prototype:
	93	* int f(char buffer, char *start, off_t offset,
	94	* int count, int peof, void dat)
	95	*
	96	* Assume that the buffer is "count" bytes in size.
	97	*
	98	* If you know you have supplied all the data you
	99	* have, set *peof.
	100	*
	101	* You have three ways to return data:
	102	* 0) Leave *start = NULL. (This is the default.)
	103	* Put the data of the requested offset at that
	104	* offset within the buffer. Return the number (n)
	105	* of bytes there are from the beginning of the
	106	* buffer up to the last byte of data. If the
	107	* number of supplied bytes (= n - offset) is
	108	* greater than zero and you didn't signal eof
	109	* and the reader is prepared to take more data
	110	* you will be called again with the requested
	111	* offset advanced by the number of bytes
	112	* absorbed. This interface is useful for files
	113	* no larger than the buffer.
	114	* 1) Set *start = an unsigned long value less than
	115	* the buffer address but greater than zero.
	116	* Put the data of the requested offset at the
	117	* beginning of the buffer. Return the number of
	118	* bytes of data placed there. If this number is
	119	* greater than zero and you didn't signal eof
	120	* and the reader is prepared to take more data
	121	* you will be called again with the requested
	122	* offset advanced by *start. This interface is
	123	* useful when you have a large file consisting
	124	* of a series of blocks which you want to count
	125	* and return as wholes.
	126	* (Hack by Paul.Russell@rustcorp.com.au)
	127	* 2) Set *start = an address within the buffer.
	128	* Put the data of the requested offset at *start.
	129	* Return the number of bytes of data placed there.
	130	* If this number is greater than zero and you
	131	* didn't signal eof and the reader is prepared to
	132	* take more data you will be called again with the
	133	* requested offset advanced by the number of bytes
	134	* absorbed.
	135	*/
	136	n = dp->read_proc(page, &start, *ppos,
	137	count, &eof, dp->data);
138	} else
139	break;
140
141	if (n == 0) /* end of file */
142	break;
143	if (n < 0) { /* error */
144	if (retval == 0)
145	retval = n;
146	break;
147	}
148
149	if (start == NULL) {
150	if (n > PAGE_SIZE) {
151	printk(KERN_ERR
152	"proc_file_read: Apparent buffer overflow!\n");
153	n = PAGE_SIZE;
154	}
155	n -= *ppos;
156	if (n <= 0)
157	break;
158	if (n > count)
159	n = count;
160	start = page + *ppos;
161	} else if (start < page) {
162	if (n > PAGE_SIZE) {
163	printk(KERN_ERR
164	"proc_file_read: Apparent buffer overflow!\n");
165	n = PAGE_SIZE;
166	}
167	if (n > count) {
168	/*
169	* Don't reduce n because doing so might
170	* cut off part of a data block.
171	*/
172	printk(KERN_WARNING
173	"proc_file_read: Read count exceeded\n");
174	}
175	} else /* start >= page */ {
176	unsigned long startoff = (unsigned long)(start - page);
177	if (n > (PAGE_SIZE - startoff)) {
178	printk(KERN_ERR
179	"proc_file_read: Apparent buffer overflow!\n");
180	n = PAGE_SIZE - startoff;
181	}
182	if (n > count)
183	n = count;
184	}
185
186	n -= copy_to_user(buf, start < page ? page : start, n);
187	if (n == 0) {
188	if (retval == 0)
189	retval = -EFAULT;
190	break;
191	}
192
193	*ppos += start < page ? (unsigned long)start : n;
194	nbytes -= n;
195	buf += n;
196	retval += n;
197	}
198	free_page((unsigned long) page);
199	return retval;
200	}
201
202	static ssize_t
203	proc_file_write(struct file file, const char __user buffer,
204	size_t count, loff_t *ppos)
205	{
206	struct inode *inode = file->f_dentry->d_inode;
207	struct proc_dir_entry * dp;
208
209	dp = PDE(inode);
210
211	if (!dp->write_proc)
212	return -EIO;
213
214	/* FIXME: does this routine need ppos? probably... */
215	return dp->write_proc(file, buffer, count, dp->data);
216	}
217
218
219	static loff_t
220	proc_file_lseek(struct file *file, loff_t offset, int orig)
221	{
8b90db0d LT	222	loff_t retval = -EINVAL;
	223	switch (orig) {
	224	case 1:
	225	offset += file->f_pos;
	226	/* fallthrough */
	227	case 0:
	228	if (offset < 0 \|\| offset > MAX_NON_LFS)
	229	break;
	230	file->f_pos = retval = offset;
	231	}
	232	return retval;
1da177e4 LT	233	}
	234
	235	static int proc_notify_change(struct dentry dentry, struct iattr iattr)
	236	{
	237	struct inode *inode = dentry->d_inode;
	238	struct proc_dir_entry *de = PDE(inode);
	239	int error;
	240
	241	error = inode_change_ok(inode, iattr);
	242	if (error)
	243	goto out;
	244
	245	error = inode_setattr(inode, iattr);
	246	if (error)
	247	goto out;
	248
	249	de->uid = inode->i_uid;
	250	de->gid = inode->i_gid;
	251	de->mode = inode->i_mode;
	252	out:
	253	return error;
	254	}
	255
2b579bee MS	256	static int proc_getattr(struct vfsmount mnt, struct dentry dentry,
	257	struct kstat *stat)
	258	{
	259	struct inode *inode = dentry->d_inode;
	260	struct proc_dir_entry *de = PROC_I(inode)->pde;
	261	if (de && de->nlink)
	262	inode->i_nlink = de->nlink;
	263
	264	generic_fillattr(inode, stat);
	265	return 0;
	266	}
	267
1da177e4 LT	268	static struct inode_operations proc_file_inode_operations = {
	269	.setattr = proc_notify_change,
	270	};
	271
	272	/*
	273	* This function parses a name such as "tty/driver/serial", and
	274	* returns the struct proc_dir_entry for "/proc/tty/driver", and
	275	* returns "serial" in residual.
	276	*/
	277	static int xlate_proc_name(const char *name,
	278	struct proc_dir_entry ret, const char residual)
	279	{
	280	const char cp = name, next;
	281	struct proc_dir_entry *de;
	282	int len;
64a07bd8	283	int rtn = 0;
1da177e4	284
64a07bd8	285	spin_lock(&proc_subdir_lock);
1da177e4 LT	286	de = &proc_root;
	287	while (1) {
	288	next = strchr(cp, '/');
	289	if (!next)
	290	break;
	291
	292	len = next - cp;
	293	for (de = de->subdir; de ; de = de->next) {
	294	if (proc_match(len, cp, de))
	295	break;
	296	}
64a07bd8 SR	297	if (!de) {
	298	rtn = -ENOENT;
	299	goto out;
	300	}
1da177e4 LT	301	cp += len + 1;
	302	}
	303	*residual = cp;
	304	*ret = de;
64a07bd8 SR	305	out:
	306	spin_unlock(&proc_subdir_lock);
	307	return rtn;
1da177e4 LT	308	}
	309
	310	static DEFINE_IDR(proc_inum_idr);
	311	static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
	312
	313	#define PROC_DYNAMIC_FIRST 0xF0000000UL
	314
	315	/*
	316	* Return an inode number between PROC_DYNAMIC_FIRST and
	317	* 0xffffffff, or zero on failure.
	318	*/
	319	static unsigned int get_inode_number(void)
	320	{
	321	int i, inum = 0;
	322	int error;
	323
	324	retry:
	325	if (idr_pre_get(&proc_inum_idr, GFP_KERNEL) == 0)
	326	return 0;
	327
	328	spin_lock(&proc_inum_lock);
	329	error = idr_get_new(&proc_inum_idr, NULL, &i);
	330	spin_unlock(&proc_inum_lock);
	331	if (error == -EAGAIN)
	332	goto retry;
	333	else if (error)
	334	return 0;
	335
	336	inum = (i & MAX_ID_MASK) + PROC_DYNAMIC_FIRST;
	337
	338	/* inum will never be more than 0xf0ffffff, so no check
	339	* for overflow.
	340	*/
	341
	342	return inum;
	343	}
	344
	345	static void release_inode_number(unsigned int inum)
	346	{
	347	int id = (inum - PROC_DYNAMIC_FIRST) \| ~MAX_ID_MASK;
	348
	349	spin_lock(&proc_inum_lock);
	350	idr_remove(&proc_inum_idr, id);
	351	spin_unlock(&proc_inum_lock);
	352	}
	353
008b150a	354	static void proc_follow_link(struct dentry dentry, struct nameidata *nd)
1da177e4 LT	355	{
1da177e4 LT	356	nd_set_link(nd, PDE(dentry->d_inode)->data);
008b150a	357	return NULL;
1da177e4 LT	358	}
	359
	360	static struct inode_operations proc_link_inode_operations = {
	361	.readlink = generic_readlink,
	362	.follow_link = proc_follow_link,
	363	};
	364
	365	/*
	366	* As some entries in /proc are volatile, we want to
	367	* get rid of unused dentries. This could be made
	368	* smarter: we could keep a "volatile" flag in the
	369	* inode to indicate which ones to keep.
	370	*/
	371	static int proc_delete_dentry(struct dentry * dentry)
	372	{
	373	return 1;
	374	}
	375
	376	static struct dentry_operations proc_dentry_operations =
	377	{
	378	.d_delete = proc_delete_dentry,
	379	};
	380
	381	/*
	382	* Don't create negative dentries here, return -ENOENT by hand
	383	* instead.
	384	*/
	385	struct dentry proc_lookup(struct inode dir, struct dentry dentry, struct nameidata nd)
	386	{
	387	struct inode *inode = NULL;
	388	struct proc_dir_entry * de;
	389	int error = -ENOENT;
	390
	391	lock_kernel();
64a07bd8	392	spin_lock(&proc_subdir_lock);
1da177e4 LT	393	de = PDE(dir);
	394	if (de) {
	395	for (de = de->subdir; de ; de = de->next) {
	396	if (de->namelen != dentry->d_name.len)
	397	continue;
	398	if (!memcmp(dentry->d_name.name, de->name, de->namelen)) {
	399	unsigned int ino = de->low_ino;
	400
64a07bd8	401	spin_unlock(&proc_subdir_lock);
1da177e4 LT	402	error = -EINVAL;
1da177e4 LT	403	inode = proc_get_inode(dir->i_sb, ino, de);
64a07bd8	404	spin_lock(&proc_subdir_lock);
1da177e4 LT	405	break;
	406	}
	407	}
	408	}
64a07bd8	409	spin_unlock(&proc_subdir_lock);
1da177e4 LT	410	unlock_kernel();
	411
	412	if (inode) {
	413	dentry->d_op = &proc_dentry_operations;
	414	d_add(dentry, inode);
	415	return NULL;
	416	}
	417	return ERR_PTR(error);
	418	}
	419
	420	/*
	421	* This returns non-zero if at EOF, so that the /proc
	422	* root directory can use this and check if it should
	423	* continue with the <pid> entries..
	424	*
	425	* Note that the VFS-layer doesn't care about the return
	426	* value of the readdir() call, as long as it's non-negative
	427	* for success..
	428	*/
	429	int proc_readdir(struct file * filp,
	430	void * dirent, filldir_t filldir)
	431	{
	432	struct proc_dir_entry * de;
	433	unsigned int ino;
	434	int i;
	435	struct inode *inode = filp->f_dentry->d_inode;
	436	int ret = 0;
	437
	438	lock_kernel();
	439
	440	ino = inode->i_ino;
	441	de = PDE(inode);
	442	if (!de) {
	443	ret = -EINVAL;
	444	goto out;
	445	}
	446	i = filp->f_pos;
	447	switch (i) {
	448	case 0:
	449	if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
	450	goto out;
	451	i++;
	452	filp->f_pos++;
	453	/* fall through */
	454	case 1:
	455	if (filldir(dirent, "..", 2, i,
	456	parent_ino(filp->f_dentry),
	457	DT_DIR) < 0)
	458	goto out;
	459	i++;
	460	filp->f_pos++;
	461	/* fall through */
	462	default:
64a07bd8	463	spin_lock(&proc_subdir_lock);
1da177e4 LT	464	de = de->subdir;
	465	i -= 2;
	466	for (;;) {
	467	if (!de) {
	468	ret = 1;
64a07bd8	469	spin_unlock(&proc_subdir_lock);
1da177e4 LT	470	goto out;
	471	}
	472	if (!i)
	473	break;
	474	de = de->next;
	475	i--;
	476	}
	477
	478	do {
64a07bd8 SR	479	/* filldir passes info to user space */
64a07bd8 SR	480	spin_unlock(&proc_subdir_lock);
1da177e4 LT	481	if (filldir(dirent, de->name, de->namelen, filp->f_pos,
	482	de->low_ino, de->mode >> 12) < 0)
	483	goto out;
64a07bd8	484	spin_lock(&proc_subdir_lock);
1da177e4 LT	485	filp->f_pos++;
	486	de = de->next;
	487	} while (de);
64a07bd8	488	spin_unlock(&proc_subdir_lock);
1da177e4 LT	489	}
	490	ret = 1;
	491	out: unlock_kernel();
	492	return ret;
	493	}
	494
	495	/*
	496	* These are the generic /proc directory operations. They
	497	* use the in-memory "struct proc_dir_entry" tree to parse
	498	* the /proc directory.
	499	*/
	500	static struct file_operations proc_dir_operations = {
	501	.read = generic_read_dir,
	502	.readdir = proc_readdir,
	503	};
	504
	505	/*
	506	* proc directories can do almost nothing..
	507	*/
	508	static struct inode_operations proc_dir_inode_operations = {
	509	.lookup = proc_lookup,
2b579bee	510	.getattr = proc_getattr,
1da177e4 LT	511	.setattr = proc_notify_change,
	512	};
	513
	514	static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp)
	515	{
	516	unsigned int i;
	517
	518	i = get_inode_number();
	519	if (i == 0)
	520	return -EAGAIN;
	521	dp->low_ino = i;
64a07bd8 SR	522
64a07bd8 SR	523	spin_lock(&proc_subdir_lock);
1da177e4 LT	524	dp->next = dir->subdir;
	525	dp->parent = dir;
	526	dir->subdir = dp;
64a07bd8 SR	527	spin_unlock(&proc_subdir_lock);
64a07bd8 SR	528
1da177e4 LT	529	if (S_ISDIR(dp->mode)) {
	530	if (dp->proc_iops == NULL) {
	531	dp->proc_fops = &proc_dir_operations;
	532	dp->proc_iops = &proc_dir_inode_operations;
	533	}
	534	dir->nlink++;
	535	} else if (S_ISLNK(dp->mode)) {
	536	if (dp->proc_iops == NULL)
	537	dp->proc_iops = &proc_link_inode_operations;
	538	} else if (S_ISREG(dp->mode)) {
	539	if (dp->proc_fops == NULL)
	540	dp->proc_fops = &proc_file_operations;
	541	if (dp->proc_iops == NULL)
	542	dp->proc_iops = &proc_file_inode_operations;
	543	}
	544	return 0;
	545	}
	546
	547	/*
	548	* Kill an inode that got unregistered..
	549	*/
	550	static void proc_kill_inodes(struct proc_dir_entry *de)
	551	{
	552	struct list_head *p;
	553	struct super_block *sb = proc_mnt->mnt_sb;
	554
	555	/*
	556	* Actually it's a partial revoke().
	557	*/
	558	file_list_lock();
	559	list_for_each(p, &sb->s_files) {
2f512016	560	struct file * filp = list_entry(p, struct file, f_u.fu_list);
1da177e4 LT	561	struct dentry * dentry = filp->f_dentry;
1da177e4 LT	562	struct inode * inode;
99ac48f5	563	const struct file_operations *fops;
1da177e4 LT	564
	565	if (dentry->d_op != &proc_dentry_operations)
	566	continue;
	567	inode = dentry->d_inode;
	568	if (PDE(inode) != de)
	569	continue;
	570	fops = filp->f_op;
	571	filp->f_op = NULL;
	572	fops_put(fops);
	573	}
	574	file_list_unlock();
	575	}
	576
	577	static struct proc_dir_entry proc_create(struct proc_dir_entry *parent,
	578	const char *name,
	579	mode_t mode,
	580	nlink_t nlink)
	581	{
	582	struct proc_dir_entry *ent = NULL;
	583	const char *fn = name;
	584	int len;
	585
	586	/* make sure name is valid */
	587	if (!name \|\| !strlen(name)) goto out;
	588
	589	if (!(*parent) && xlate_proc_name(name, parent, &fn) != 0)
	590	goto out;
	591
	592	/* At this point there must not be any '/' characters beyond fn /
	593	if (strchr(fn, '/'))
	594	goto out;
	595
	596	len = strlen(fn);
	597
	598	ent = kmalloc(sizeof(struct proc_dir_entry) + len + 1, GFP_KERNEL);
	599	if (!ent) goto out;
	600
	601	memset(ent, 0, sizeof(struct proc_dir_entry));
	602	memcpy(((char *) ent) + sizeof(struct proc_dir_entry), fn, len + 1);
	603	ent->name = ((char ) ent) + sizeof(ent);
	604	ent->namelen = len;
	605	ent->mode = mode;
	606	ent->nlink = nlink;
	607	out:
	608	return ent;
	609	}
	610
	611	struct proc_dir_entry proc_symlink(const char name,
	612	struct proc_dir_entry parent, const char dest)
	613	{
	614	struct proc_dir_entry *ent;
	615
	616	ent = proc_create(&parent,name,
	617	(S_IFLNK \| S_IRUGO \| S_IWUGO \| S_IXUGO),1);
	618
	619	if (ent) {
	620	ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL);
	621	if (ent->data) {
	622	strcpy((char*)ent->data,dest);
	623	if (proc_register(parent, ent) < 0) {
	624	kfree(ent->data);
	625	kfree(ent);
	626	ent = NULL;
	627	}
628	} else {
629	kfree(ent);
630	ent = NULL;
631	}
632	}
633	return ent;
634	}
635
636	struct proc_dir_entry proc_mkdir_mode(const char name, mode_t mode,
637	struct proc_dir_entry *parent)
638	{
639	struct proc_dir_entry *ent;
640
641	ent = proc_create(&parent, name, S_IFDIR \| mode, 2);
642	if (ent) {
643	ent->proc_fops = &proc_dir_operations;
644	ent->proc_iops = &proc_dir_inode_operations;
645
646	if (proc_register(parent, ent) < 0) {
647	kfree(ent);
648	ent = NULL;
649	}
650	}
651	return ent;
652	}
653
654	struct proc_dir_entry proc_mkdir(const char name,
655	struct proc_dir_entry *parent)
656	{
657	return proc_mkdir_mode(name, S_IRUGO \| S_IXUGO, parent);
658	}
659
660	struct proc_dir_entry create_proc_entry(const char name, mode_t mode,
661	struct proc_dir_entry *parent)
662	{
663	struct proc_dir_entry *ent;
664	nlink_t nlink;
665
666	if (S_ISDIR(mode)) {
667	if ((mode & S_IALLUGO) == 0)
668	mode \|= S_IRUGO \| S_IXUGO;
669	nlink = 2;
670	} else {
671	if ((mode & S_IFMT) == 0)
672	mode \|= S_IFREG;
673	if ((mode & S_IALLUGO) == 0)
674	mode \|= S_IRUGO;
675	nlink = 1;
676	}
677
678	ent = proc_create(&parent,name,mode,nlink);
679	if (ent) {
680	if (S_ISDIR(mode)) {
681	ent->proc_fops = &proc_dir_operations;
682	ent->proc_iops = &proc_dir_inode_operations;
683	}
684	if (proc_register(parent, ent) < 0) {
685	kfree(ent);
686	ent = NULL;
687	}
688	}
689	return ent;
690	}
691
692	void free_proc_entry(struct proc_dir_entry *de)
693	{
694	unsigned int ino = de->low_ino;
695
696	if (ino < PROC_DYNAMIC_FIRST)
697	return;
698
699	release_inode_number(ino);
700
701	if (S_ISLNK(de->mode) && de->data)
702	kfree(de->data);
703	kfree(de);
704	}
705
706	/*
707	* Remove a /proc entry and free it if it's not currently in use.
708	* If it is in use, we set the 'deleted' flag.
709	*/
710	void remove_proc_entry(const char name, struct proc_dir_entry parent)
711	{
712	struct proc_dir_entry **p;
713	struct proc_dir_entry *de;
714	const char *fn = name;
715	int len;
716
717	if (!parent && xlate_proc_name(name, &parent, &fn) != 0)
718	goto out;
719	len = strlen(fn);
64a07bd8 SR	720
64a07bd8 SR	721	spin_lock(&proc_subdir_lock);
1da177e4 LT	722	for (p = &parent->subdir; p; p=&(p)->next ) {
	723	if (!proc_match(len, fn, *p))
	724	continue;
	725	de = *p;
	726	*p = de->next;
	727	de->next = NULL;
	728	if (S_ISDIR(de->mode))
	729	parent->nlink--;
	730	proc_kill_inodes(de);
	731	de->nlink = 0;
	732	WARN_ON(de->subdir);
	733	if (!atomic_read(&de->count))
	734	free_proc_entry(de);
	735	else {
	736	de->deleted = 1;
	737	printk("remove_proc_entry: %s/%s busy, count=%d\n",
	738	parent->name, de->name, atomic_read(&de->count));
	739	}
	740	break;
	741	}
64a07bd8	742	spin_unlock(&proc_subdir_lock);
1da177e4 LT	743	out:
	744	return;
	745	}