[net-next-2.6.git] / net / sched / sch_sfq.c

/*
 * net/sched/sch_sfq.c	Stochastic Fairness Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/jiffies.h>
#include <linux/string.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/ipv6.h>
#include <linux/skbuff.h>
#include <linux/jhash.h>
#include <linux/slab.h>
#include <net/ip.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>


/*	Stochastic Fairness Queuing algorithm.
	=======================================

	Source:
	Paul E. McKenney "Stochastic Fairness Queuing",
	IEEE INFOCOM'90 Proceedings, San Francisco, 1990.

	Paul E. McKenney "Stochastic Fairness Queuing",
	"Interworking: Research and Experience", v.2, 1991, p.113-131.


	See also:
	M. Shreedhar and George Varghese "Efficient Fair
	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.


	This is not the thing that is usually called (W)FQ nowadays.
	It does not use any timestamp mechanism, but instead
	processes queues in round-robin order.

	ADVANTAGE:

	- It is very cheap. Both CPU and memory requirements are minimal.

	DRAWBACKS:

	- "Stochastic" -> It is not 100% fair.
	When hash collisions occur, several flows are considered as one.

	- "Round-robin" -> It introduces larger delays than virtual clock
	based schemes, and should not be used for isolating interactive
	traffic	from non-interactive. It means, that this scheduler
	should be used as leaf of CBQ or P3, which put interactive traffic
	to higher priority band.

	We still need true WFQ for top level CSZ, but using WFQ
	for the best effort traffic is absolutely pointless:
	SFQ is superior for this purpose.

	IMPLEMENTATION:
	This implementation limits maximal queue length to 128;
	maximal mtu to 2^15-1; number of hash buckets to 1024.
	The only goal of these restrictions was that all data
	fit into one 4K page :-). Struct sfq_sched_data is
	organized in anti-cache manner: all the data for a bucket
	are scattered over different locations. This is not good,
	but it allowed me to put it into 4K.

	It is easy to increase these values, but not in flight.  */

/* Number of flow slots (numbered 0..SFQ_DEPTH-1).  The value SFQ_DEPTH
 * itself is stored in ht[] and in 'tail' to mean "no slot". */
#define SFQ_DEPTH		128
/* Number of hash buckets.  sfq_fold_hash() reduces the hash with a
 * (SFQ_HASH_DIVISOR - 1) mask, so this has to stay a power of two. */
#define SFQ_HASH_DIVISOR	1024

/* This type should contain at least SFQ_DEPTH*2 values */
typedef unsigned char sfq_index;

/* One node of the doubly linked "depth" lists stored in
 * sfq_sched_data.dep[]; both links are indexes into that array. */
struct sfq_head
{
	sfq_index	next;
	sfq_index	prev;
};

/*
 * Private state of one SFQ qdisc instance.
 *
 * Flows are hashed into ht[], which maps a bucket to a slot number or to
 * SFQ_DEPTH when the bucket has no slot.  Slots holding packets form a
 * circular singly linked ring through next[], entered at next[tail].
 * dep[] keeps every slot on a doubly linked list selected by its current
 * queue length: dep[0..SFQ_DEPTH-1] are the slot nodes, and
 * dep[SFQ_DEPTH + d] is the head of the list of slots holding d packets.
 */
struct sfq_sched_data
{
/* Parameters */
	int		perturb_period;
	unsigned	quantum;	/* Allotment per round: MUST BE >= MTU */
	int		limit;

/* Variables */
	struct tcf_proto *filter_list;
	struct timer_list perturb_timer;
	u32		perturbation;
	sfq_index	tail;		/* Index of current slot in round */
	sfq_index	max_depth;	/* Maximal depth */

	sfq_index	ht[SFQ_HASH_DIVISOR];	/* Hash table */
	sfq_index	next[SFQ_DEPTH];	/* Active slots link */
	short		allot[SFQ_DEPTH];	/* Current allotment per slot */
	unsigned short	hash[SFQ_DEPTH];	/* Hash value indexed by slots */
	struct sk_buff_head	qs[SFQ_DEPTH];		/* Slot queue */
	struct sfq_head	dep[SFQ_DEPTH*2];	/* Linked list of slots, indexed by depth */
};

/* Mix two 32-bit keys with the current perturbation value and reduce the
 * result to a bucket number in the range 0 .. SFQ_HASH_DIVISOR-1. */
static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
{
	const unsigned bucket_mask = SFQ_HASH_DIVISOR - 1;
	u32 mixed = jhash_2words(h, h1, q->perturbation);

	return mixed & bucket_mask;
}

static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
{
	u32 h, h2;

	switch (skb->protocol) {
	case htons(ETH_P_IP):
	{
		const struct iphdr *iph;
		int poff;

		if (!pskb_network_may_pull(skb, sizeof(*iph)))
			goto err;
		iph = ip_hdr(skb);
		h = (__force u32)iph->daddr;
		h2 = (__force u32)iph->saddr ^ iph->protocol;
		if (iph->frag_off & htons(IP_MF|IP_OFFSET))
			break;
		poff = proto_ports_offset(iph->protocol);
		if (poff >= 0 &&
		    pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
			iph = ip_hdr(skb);
			h2 ^= *(u32*)((void *)iph + iph->ihl * 4 + poff);
		}
		break;
	}
	case htons(ETH_P_IPV6):
	{
		struct ipv6hdr *iph;
		int poff;

		if (!pskb_network_may_pull(skb, sizeof(*iph)))
			goto err;
		iph = ipv6_hdr(skb);
		h = (__force u32)iph->daddr.s6_addr32[3];
		h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr;
		poff = proto_ports_offset(iph->nexthdr);
		if (poff >= 0 &&
		    pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) {
			iph = ipv6_hdr(skb);
			h2 ^= *(u32*)((void *)iph + sizeof(*iph) + poff);
		}
		break;
	}
	default:
err:
		h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol;
		h2 = (unsigned long)skb->sk;
	}

	return sfq_fold_hash(q, h, h2);
}

static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
				 int *qerr)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tcf_result res;
	int result;

	if (TC_H_MAJ(skb->priority) == sch->handle &&
	    TC_H_MIN(skb->priority) > 0 &&
	    TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
		return TC_H_MIN(skb->priority);

	if (!q->filter_list)
		return sfq_hash(q, skb) + 1;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	result = tc_classify(skb, q->filter_list, &res);
	if (result >= 0) {
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_STOLEN:
		case TC_ACT_QUEUED:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
		case TC_ACT_SHOT:
			return 0;
		}
#endif
		if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
			return TC_H_MIN(res.classid);
	}
	return 0;
}

/* Insert slot x at the front of the depth list that matches its current
 * queue length.  The slot must not be on any depth list when called. */
static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index head = q->qs[x].qlen + SFQ_DEPTH;
	sfq_index first = q->dep[head].next;

	q->dep[x].prev = head;
	q->dep[x].next = first;
	q->dep[first].prev = x;
	q->dep[head].next = x;
}

/*
 * Move slot x to the depth list for its (already decremented) queue
 * length.  If x was the only slot on the deepest list, max_depth is
 * lowered by one.
 */
static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index after = q->dep[x].next;
	sfq_index before = q->dep[x].prev;

	/* Take x off the list it is currently on. */
	q->dep[before].next = after;
	q->dep[after].prev = before;

	/* Equal neighbours mean the old list is now empty (both are its head). */
	if (after == before) {
		if (q->max_depth == q->qs[x].qlen + 1)
			q->max_depth--;
	}

	sfq_link(q, x);
}

/*
 * Move slot x to the depth list for its (already incremented) queue
 * length and raise max_depth if this slot is now the deepest one.
 */
static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
{
	sfq_index after = q->dep[x].next;
	sfq_index before = q->dep[x].prev;
	int depth = q->qs[x].qlen;

	/* Take x off the list it is currently on. */
	q->dep[before].next = after;
	q->dep[after].prev = before;

	if (depth > q->max_depth)
		q->max_depth = depth;

	sfq_link(q, x);
}

/*
 * Drop one queued packet and return its length in bytes, or 0 when
 * nothing is queued (max_depth == 0).
 *
 * When some slot holds more than one packet, the last packet of the
 * first slot on the deepest depth list is dropped.  When every active
 * slot holds exactly one packet, the slot at the head of the round-robin
 * ring is emptied, taken out of the ring and its hash bucket released.
 *
 * This function is also installed as the qdisc's .drop operation, so it
 * can run while only a single one-packet slot is active.  In that case
 * the ring becomes empty and q->tail must be reset to SFQ_DEPTH;
 * otherwise sfq_dequeue() would later pull from an empty slot.
 */
static unsigned int sfq_drop(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index d = q->max_depth;
	struct sk_buff *skb;
	unsigned int len;

	/* Queue is full! Find the longest slot and
	   drop a packet from it */

	if (d > 1) {
		sfq_index x = q->dep[d + SFQ_DEPTH].next;
		skb = q->qs[x].prev;
		len = qdisc_pkt_len(skb);
		__skb_unlink(skb, &q->qs[x]);
		kfree_skb(skb);
		sfq_dec(q, x);
		sch->q.qlen--;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	if (d == 1) {
		/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
		d = q->next[q->tail];
		if (d == q->tail) {
			/* Sole active slot: the ring is empty afterwards. */
			q->tail = SFQ_DEPTH;
		} else {
			/* Unlink the head slot and credit the new head. */
			q->next[q->tail] = q->next[d];
			q->allot[q->next[d]] += q->quantum;
		}
		skb = q->qs[d].prev;
		len = qdisc_pkt_len(skb);
		__skb_unlink(skb, &q->qs[d]);
		kfree_skb(skb);
		sfq_dec(q, d);
		sch->q.qlen--;
		q->ht[q->hash[d]] = SFQ_DEPTH;
		sch->qstats.drops++;
		sch->qstats.backlog -= len;
		return len;
	}

	return 0;
}

/*
 * Queue a packet on the slot selected by sfq_classify().
 *
 * Returns the classifier's verdict when the packet is rejected there,
 * the result of qdisc_drop() when the chosen slot alone already holds
 * q->limit packets, NET_XMIT_CN when queueing pushed the total above
 * q->limit (one packet is then dropped via sfq_drop()), and
 * NET_XMIT_SUCCESS otherwise.
 */
static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int bucket;
	sfq_index slot;
	int uninitialized_var(ret);

	bucket = sfq_classify(skb, sch, &ret);
	if (bucket == 0) {
		if (ret & __NET_XMIT_BYPASS)
			sch->qstats.drops++;
		kfree_skb(skb);
		return ret;
	}
	bucket--;

	slot = q->ht[bucket];
	if (slot == SFQ_DEPTH) {
		/* Bucket has no slot yet: take the first one on the depth-0 list. */
		slot = q->dep[SFQ_DEPTH].next;
		q->ht[bucket] = slot;
		q->hash[slot] = bucket;
	}

	/* If selected queue has length q->limit, this means that
	 * all another queues are empty and that we do simple tail drop,
	 * i.e. drop _this_ packet.
	 */
	if (q->qs[slot].qlen >= q->limit)
		return qdisc_drop(skb, sch);

	sch->qstats.backlog += qdisc_pkt_len(skb);
	__skb_queue_tail(&q->qs[slot], skb);
	sfq_inc(q, slot);

	if (q->qs[slot].qlen == 1) {		/* The flow is new */
		if (q->tail != SFQ_DEPTH) {
			/* Splice the slot in right after the current tail. */
			q->next[slot] = q->next[q->tail];
			q->next[q->tail] = slot;
		} else {			/* It is the first flow */
			q->next[slot] = slot;
			q->allot[slot] = q->quantum;
		}
		q->tail = slot;
	}

	sch->q.qlen++;
	if (sch->q.qlen > q->limit) {
		sfq_drop(sch);
		return NET_XMIT_CN;
	}

	sch->bstats.bytes += qdisc_pkt_len(skb);
	sch->bstats.packets++;
	return NET_XMIT_SUCCESS;
}

/* Return, without removing it, the first packet of the slot at the head
 * of the round-robin ring, or NULL when no slot is active. */
static struct sk_buff *
sfq_peek(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	if (q->tail != SFQ_DEPTH) {
		sfq_index head = q->next[q->tail];

		return skb_peek(&q->qs[head]);
	}

	/* No active slots */
	return NULL;
}

/*
 * Remove and return the next packet in round-robin order, or NULL when
 * no slot is active.
 *
 * The packet comes from the slot at the head of the ring.  If that slot
 * becomes empty it leaves the ring and its hash bucket is released; if
 * it used up its allotment it becomes the new tail.  In both cases the
 * slot that follows is credited with one quantum.
 */
static struct sk_buff *
sfq_dequeue(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	sfq_index cur, follower;

	/* No active slots */
	if (q->tail == SFQ_DEPTH)
		return NULL;

	cur = q->next[q->tail];

	/* Grab packet */
	skb = __skb_dequeue(&q->qs[cur]);
	sfq_dec(q, cur);
	sch->q.qlen--;
	sch->qstats.backlog -= qdisc_pkt_len(skb);

	if (q->qs[cur].qlen != 0) {
		/* Slot still has packets: charge it for this one. */
		q->allot[cur] -= qdisc_pkt_len(skb);
		if (q->allot[cur] <= 0) {
			q->tail = cur;
			follower = q->next[cur];
			q->allot[follower] += q->quantum;
		}
		return skb;
	}

	/* The slot is empty: release its bucket and drop it from the ring. */
	q->ht[q->hash[cur]] = SFQ_DEPTH;
	follower = q->next[cur];
	if (follower == cur) {
		/* It was the only active slot. */
		q->tail = SFQ_DEPTH;
		return skb;
	}
	q->next[q->tail] = follower;
	q->allot[follower] += q->quantum;
	return skb;
}

/* Empty the qdisc by dequeueing and freeing every packet. */
static void
sfq_reset(struct Qdisc *sch)
{
	struct sk_buff *skb = sfq_dequeue(sch);

	while (skb != NULL) {
		kfree_skb(skb);
		skb = sfq_dequeue(sch);
	}
}

/*
 * Timer callback: pick a new random hash perturbation and, while a
 * perturb period is configured, re-arm the timer for the next period.
 * 'arg' is the struct Qdisc pointer stored in the timer's data field.
 */
static void sfq_perturbation(unsigned long arg)
{
	struct sfq_sched_data *q = qdisc_priv((struct Qdisc *)arg);

	q->perturbation = net_random();

	if (!q->perturb_period)
		return;

	mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
}

/*
 * Apply a struct tc_sfq_qopt carried in the netlink attribute 'opt'.
 *
 * Returns -EINVAL when the attribute is too short, 0 otherwise.
 * A zero quantum selects the device's psched_mtu(); a zero limit leaves
 * the current limit untouched, and any limit is capped at SFQ_DEPTH - 1.
 * Packets above the new limit are dropped, and the perturbation timer is
 * restarted (with a fresh perturbation value) when a period is set.
 */
static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	struct tc_sfq_qopt *ctl = nla_data(opt);
	unsigned int old_qlen;

	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
		return -EINVAL;

	sch_tree_lock(sch);

	if (ctl->quantum)
		q->quantum = ctl->quantum;
	else
		q->quantum = psched_mtu(qdisc_dev(sch));
	q->perturb_period = ctl->perturb_period * HZ;
	if (ctl->limit)
		q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);

	/* Shrink to the new limit and report the loss via qdisc_tree_decrease_qlen(). */
	old_qlen = sch->q.qlen;
	while (sch->q.qlen > q->limit)
		sfq_drop(sch);
	qdisc_tree_decrease_qlen(sch, old_qlen - sch->q.qlen);

	del_timer(&q->perturb_timer);
	if (q->perturb_period) {
		mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
		q->perturbation = net_random();
	}

	sch_tree_unlock(sch);
	return 0;
}

/*
 * Initialise a freshly allocated SFQ qdisc.
 *
 * Sets up the perturbation timer, marks every hash bucket as having no
 * slot, initialises the per-slot queues and the depth list heads, and
 * then either applies the defaults (opt == NULL) or the options in
 * 'opt' through sfq_change().  Finally all slots are put on the depth-0
 * list.  Returns 0 on success or the error from sfq_change().
 */
static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	int i;

	q->perturb_timer.function = sfq_perturbation;
	q->perturb_timer.data = (unsigned long)sch;
	init_timer_deferrable(&q->perturb_timer);

	for (i = 0; i < SFQ_HASH_DIVISOR; i++)
		q->ht[i] = SFQ_DEPTH;

	for (i = 0; i < SFQ_DEPTH; i++) {
		skb_queue_head_init(&q->qs[i]);
		/* Each depth list starts out empty: the head points at itself. */
		q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH;
		q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH;
	}

	q->limit = SFQ_DEPTH - 1;
	q->max_depth = 0;
	q->tail = SFQ_DEPTH;
	/*
	 * Seed the hash perturbation on every path.  sfq_change() only
	 * picks a random value when a non-zero perturb period is given, so
	 * without this a qdisc created with options and perturb 0 would
	 * never get a randomised hash key.
	 */
	q->perturbation = net_random();
	if (opt == NULL) {
		q->quantum = psched_mtu(qdisc_dev(sch));
		q->perturb_period = 0;
	} else {
		int err = sfq_change(sch, opt);
		if (err)
			return err;
	}

	for (i = 0; i < SFQ_DEPTH; i++)
		sfq_link(q, i);
	return 0;
}

/* Tear down the qdisc: release the attached filter chain and stop the
 * perturbation timer. */
static void sfq_destroy(struct Qdisc *sch)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	tcf_destroy_chain(&q->filter_list);
	/* Clear the period first: sfq_perturbation() only re-arms the timer
	 * while perturb_period is non-zero. */
	q->perturb_period = 0;
	del_timer_sync(&q->perturb_timer);
}

static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_sfq_qopt opt;

	opt.quantum = q->quantum;
	opt.perturb_period = q->perturb_period / HZ;

	opt.limit = q->limit;
	opt.divisor = SFQ_HASH_DIVISOR;
	opt.flows = q->limit;

	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);

	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}

/* Class operation stub: always returns NULL, i.e. no leaf qdisc is
 * reported for any class. */
static struct Qdisc *sfq_leaf(struct Qdisc *sch, unsigned long arg)
{
	return NULL;
}

/* Class operation stub: always returns 0 regardless of classid. */
static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
{
	return 0;
}

/* Filter-bind stub: always returns 0 and keeps no state. */
static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
			      u32 classid)
{
	return 0;
}

/* No-op; installed as both the .put and the .unbind_tcf class operation. */
static void sfq_put(struct Qdisc *q, unsigned long cl)
{
}

/* Return the address of the qdisc-wide filter list when cl is 0, and
 * NULL for any non-zero class value. */
static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
{
	struct sfq_sched_data *q = qdisc_priv(sch);

	return cl ? NULL : &q->filter_list;
}

/* Fill in the class handle for a dump: the minor part of cl is OR'ed
 * into tcm->tcm_handle.  Always returns 0. */
static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
			  struct sk_buff *skb, struct tcmsg *tcm)
{
	tcm->tcm_handle = tcm->tcm_handle | TC_H_MIN(cl);

	return 0;
}

static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
				struct gnet_dump *d)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	sfq_index idx = q->ht[cl-1];
	struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
	struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };

	if (gnet_stats_copy_queue(d, &qs) < 0)
		return -1;
	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
}

/*
 * Visit every hash bucket that currently has a slot, calling arg->fn
 * with the bucket number plus one as the class value.
 *
 * arg->count advances once per bucket, whether or not it has a slot;
 * buckets are only reported once count has reached arg->skip.  A negative
 * return from arg->fn sets arg->stop and ends the walk without counting
 * that bucket.
 */
static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct sfq_sched_data *q = qdisc_priv(sch);
	unsigned int bucket;

	if (arg->stop)
		return;

	for (bucket = 0; bucket < SFQ_HASH_DIVISOR; bucket++) {
		if (q->ht[bucket] != SFQ_DEPTH &&
		    arg->count >= arg->skip) {
			if (arg->fn(sch, bucket + 1, arg) < 0) {
				arg->stop = 1;
				break;
			}
		}
		arg->count++;
	}
}

/* Class operations table.  leaf/get/bind_tcf are constant-returning
 * stubs, and sfq_put (a no-op) serves as both .put and .unbind_tcf. */
static const struct Qdisc_class_ops sfq_class_ops = {
	.leaf		=	sfq_leaf,
	.get		=	sfq_get,
	.put		=	sfq_put,
	.tcf_chain	=	sfq_find_tcf,
	.bind_tcf	=	sfq_bind,
	.unbind_tcf	=	sfq_put,
	.dump		=	sfq_dump_class,
	.dump_stats	=	sfq_dump_class_stats,
	.walk		=	sfq_walk,
};

/* Qdisc operations registered under the id "sfq".  .change is left NULL:
 * sfq_change() is only reached through sfq_init() in this file. */
static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	.cl_ops		=	&sfq_class_ops,
	.id		=	"sfq",
	.priv_size	=	sizeof(struct sfq_sched_data),
	.enqueue	=	sfq_enqueue,
	.dequeue	=	sfq_dequeue,
	.peek		=	sfq_peek,
	.drop		=	sfq_drop,
	.init		=	sfq_init,
	.reset		=	sfq_reset,
	.destroy	=	sfq_destroy,
	.change		=	NULL,
	.dump		=	sfq_dump,
	.owner		=	THIS_MODULE,
};

/* Module entry point: register the "sfq" qdisc operations and return
 * the result of register_qdisc(). */
static int __init sfq_module_init(void)
{
	return register_qdisc(&sfq_qdisc_ops);
}
/* Module exit point: unregister the "sfq" qdisc operations. */
static void __exit sfq_module_exit(void)
{
	unregister_qdisc(&sfq_qdisc_ops);
}
module_init(sfq_module_init)
module_exit(sfq_module_exit)
MODULE_LICENSE("GPL");
Commit	Line	Data
	1	/*
	2	* net/sched/sch_sfq.c Stochastic Fairness Queueing discipline.
	3	*
	4	* This program is free software; you can redistribute it and/or
	5	* modify it under the terms of the GNU General Public License
	6	* as published by the Free Software Foundation; either version
	7	* 2 of the License, or (at your option) any later version.
	8	*
	9	* Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
	10	*/
	11
	12	#include <linux/module.h>
	13	#include <linux/types.h>
	14	#include <linux/kernel.h>
	15	#include <linux/jiffies.h>
	16	#include <linux/string.h>
	17	#include <linux/in.h>
	18	#include <linux/errno.h>
	19	#include <linux/init.h>
	20	#include <linux/ipv6.h>
	21	#include <linux/skbuff.h>
	22	#include <linux/jhash.h>
	23	#include <linux/slab.h>
	24	#include <net/ip.h>
	25	#include <net/netlink.h>
	26	#include <net/pkt_sched.h>
	27
	28
	29	/* Stochastic Fairness Queuing algorithm.
	30	=======================================
	31
	32	Source:
	33	Paul E. McKenney "Stochastic Fairness Queuing",
	34	IEEE INFOCOMM'90 Proceedings, San Francisco, 1990.
	35
	36	Paul E. McKenney "Stochastic Fairness Queuing",
	37	"Interworking: Research and Experience", v.2, 1991, p.113-131.
	38
	39
	40	See also:
	41	M. Shreedhar and George Varghese "Efficient Fair
	42	Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
	43
	44
	45	This is not the thing that is usually called (W)FQ nowadays.
	46	It does not use any timestamp mechanism, but instead
	47	processes queues in round-robin order.
	48
	49	ADVANTAGE:
	50
	51	- It is very cheap. Both CPU and memory requirements are minimal.
	52
	53	DRAWBACKS:
	54
	55	- "Stochastic" -> It is not 100% fair.
	56	When hash collisions occur, several flows are considered as one.
	57
	58	- "Round-robin" -> It introduces larger delays than virtual clock
	59	based schemes, and should not be used for isolating interactive
	60	traffic from non-interactive. It means, that this scheduler
	61	should be used as leaf of CBQ or P3, which put interactive traffic
	62	to higher priority band.
	63
	64	We still need true WFQ for top level CSZ, but using WFQ
	65	for the best effort traffic is absolutely pointless:
	66	SFQ is superior for this purpose.
	67
	68	IMPLEMENTATION:
	69	This implementation limits maximal queue length to 128;
	70	maximal mtu to 2^15-1; number of hash buckets to 1024.
	71	The only goal of this restrictions was that all data
	72	fit into one 4K page :-). Struct sfq_sched_data is
	73	organized in anti-cache manner: all the data for a bucket
	74	are scattered over different locations. This is not good,
	75	but it allowed me to put it into 4K.
	76
	77	It is easy to increase these values, but not in flight. */
	78
	79	#define SFQ_DEPTH 128
	80	#define SFQ_HASH_DIVISOR 1024
	81
	82	/* This type should contain at least SFQ_DEPTH2 values /
	83	typedef unsigned char sfq_index;
	84
	85	struct sfq_head
	86	{
	87	sfq_index next;
	88	sfq_index prev;
	89	};
	90
	91	struct sfq_sched_data
	92	{
	93	/* Parameters */
	94	int perturb_period;
	95	unsigned quantum; /* Allotment per round: MUST BE >= MTU */
	96	int limit;
	97
	98	/* Variables */
	99	struct tcf_proto *filter_list;
	100	struct timer_list perturb_timer;
	101	u32 perturbation;
	102	sfq_index tail; /* Index of current slot in round */
	103	sfq_index max_depth; /* Maximal depth */
	104
	105	sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */
	106	sfq_index next[SFQ_DEPTH]; /* Active slots link */
	107	short allot[SFQ_DEPTH]; /* Current allotment per slot */
	108	unsigned short hash[SFQ_DEPTH]; /* Hash value indexed by slots */
	109	struct sk_buff_head qs[SFQ_DEPTH]; /* Slot queue */
	110	struct sfq_head dep[SFQ_DEPTH2]; / Linked list of slots, indexed by depth */
	111	};
	112
	113	static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
	114	{
	115	return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
	116	}
	117
	118	static unsigned sfq_hash(struct sfq_sched_data q, struct sk_buff skb)
	119	{
	120	u32 h, h2;
	121
	122	switch (skb->protocol) {
	123	case htons(ETH_P_IP):
	124	{
	125	const struct iphdr *iph;
	126	int poff;
	127
	128	if (!pskb_network_may_pull(skb, sizeof(*iph)))
	129	goto err;
	130	iph = ip_hdr(skb);
	131	h = (__force u32)iph->daddr;
	132	h2 = (__force u32)iph->saddr ^ iph->protocol;
	133	if (iph->frag_off & htons(IP_MF\|IP_OFFSET))
	134	break;
	135	poff = proto_ports_offset(iph->protocol);
	136	if (poff >= 0 &&
	137	pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) {
	138	iph = ip_hdr(skb);
	139	h2 ^= (u32)((void )iph + iph->ihl 4 + poff);
	140	}
	141	break;
	142	}
	143	case htons(ETH_P_IPV6):
	144	{
	145	struct ipv6hdr *iph;
	146	int poff;
	147
	148	if (!pskb_network_may_pull(skb, sizeof(*iph)))
	149	goto err;
	150	iph = ipv6_hdr(skb);
	151	h = (__force u32)iph->daddr.s6_addr32[3];
	152	h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr;
	153	poff = proto_ports_offset(iph->nexthdr);
	154	if (poff >= 0 &&
	155	pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) {
	156	iph = ipv6_hdr(skb);
	157	h2 ^= (u32)((void )iph + sizeof(iph) + poff);
	158	}
	159	break;
	160	}
	161	default:
	162	err:
	163	h = (unsigned long)skb_dst(skb) ^ (__force u32)skb->protocol;
	164	h2 = (unsigned long)skb->sk;
	165	}
	166
	167	return sfq_fold_hash(q, h, h2);
	168	}
	169
	170	static unsigned int sfq_classify(struct sk_buff skb, struct Qdisc sch,
	171	int *qerr)
	172	{
	173	struct sfq_sched_data *q = qdisc_priv(sch);
	174	struct tcf_result res;
	175	int result;
	176
	177	if (TC_H_MAJ(skb->priority) == sch->handle &&
	178	TC_H_MIN(skb->priority) > 0 &&
	179	TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
	180	return TC_H_MIN(skb->priority);
	181
	182	if (!q->filter_list)
	183	return sfq_hash(q, skb) + 1;
	184
	185	*qerr = NET_XMIT_SUCCESS \| __NET_XMIT_BYPASS;
	186	result = tc_classify(skb, q->filter_list, &res);
	187	if (result >= 0) {
	188	#ifdef CONFIG_NET_CLS_ACT
	189	switch (result) {
	190	case TC_ACT_STOLEN:
	191	case TC_ACT_QUEUED:
	192	*qerr = NET_XMIT_SUCCESS \| __NET_XMIT_STOLEN;
	193	case TC_ACT_SHOT:
	194	return 0;
	195	}
	196	#endif
	197	if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
	198	return TC_H_MIN(res.classid);
	199	}
	200	return 0;
	201	}
	202
	203	static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
	204	{
	205	sfq_index p, n;
	206	int d = q->qs[x].qlen + SFQ_DEPTH;
	207
	208	p = d;
	209	n = q->dep[d].next;
	210	q->dep[x].next = n;
	211	q->dep[x].prev = p;
	212	q->dep[p].next = q->dep[n].prev = x;
	213	}
	214
	215	static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
	216	{
	217	sfq_index p, n;
	218
	219	n = q->dep[x].next;
	220	p = q->dep[x].prev;
	221	q->dep[p].next = n;
	222	q->dep[n].prev = p;
	223
	224	if (n == p && q->max_depth == q->qs[x].qlen + 1)
	225	q->max_depth--;
	226
	227	sfq_link(q, x);
	228	}
	229
	230	static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
	231	{
	232	sfq_index p, n;
	233	int d;
	234
	235	n = q->dep[x].next;
	236	p = q->dep[x].prev;
	237	q->dep[p].next = n;
	238	q->dep[n].prev = p;
	239	d = q->qs[x].qlen;
	240	if (q->max_depth < d)
	241	q->max_depth = d;
	242
	243	sfq_link(q, x);
	244	}
	245
	246	static unsigned int sfq_drop(struct Qdisc *sch)
	247	{
	248	struct sfq_sched_data *q = qdisc_priv(sch);
	249	sfq_index d = q->max_depth;
	250	struct sk_buff *skb;
	251	unsigned int len;
	252
	253	/* Queue is full! Find the longest slot and
	254	drop a packet from it */
	255
	256	if (d > 1) {
	257	sfq_index x = q->dep[d + SFQ_DEPTH].next;
	258	skb = q->qs[x].prev;
	259	len = qdisc_pkt_len(skb);
	260	__skb_unlink(skb, &q->qs[x]);
	261	kfree_skb(skb);
	262	sfq_dec(q, x);
	263	sch->q.qlen--;
	264	sch->qstats.drops++;
	265	sch->qstats.backlog -= len;
	266	return len;
	267	}
	268
	269	if (d == 1) {
	270	/* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
	271	d = q->next[q->tail];
	272	q->next[q->tail] = q->next[d];
	273	q->allot[q->next[d]] += q->quantum;
	274	skb = q->qs[d].prev;
	275	len = qdisc_pkt_len(skb);
	276	__skb_unlink(skb, &q->qs[d]);
	277	kfree_skb(skb);
	278	sfq_dec(q, d);
	279	sch->q.qlen--;
	280	q->ht[q->hash[d]] = SFQ_DEPTH;
	281	sch->qstats.drops++;
	282	sch->qstats.backlog -= len;
	283	return len;
	284	}
	285
	286	return 0;
	287	}
	288
	289	static int
	290	sfq_enqueue(struct sk_buff skb, struct Qdisc sch)
	291	{
	292	struct sfq_sched_data *q = qdisc_priv(sch);
	293	unsigned int hash;
	294	sfq_index x;
	295	int uninitialized_var(ret);
	296
	297	hash = sfq_classify(skb, sch, &ret);
	298	if (hash == 0) {
	299	if (ret & __NET_XMIT_BYPASS)
	300	sch->qstats.drops++;
	301	kfree_skb(skb);
	302	return ret;
	303	}
	304	hash--;
	305
	306	x = q->ht[hash];
	307	if (x == SFQ_DEPTH) {
	308	q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
	309	q->hash[x] = hash;
	310	}
	311
	312	/* If selected queue has length q->limit, this means that
	313	* all another queues are empty and that we do simple tail drop,
	314	* i.e. drop _this_ packet.
	315	*/
	316	if (q->qs[x].qlen >= q->limit)
	317	return qdisc_drop(skb, sch);
	318
	319	sch->qstats.backlog += qdisc_pkt_len(skb);
	320	__skb_queue_tail(&q->qs[x], skb);
	321	sfq_inc(q, x);
	322	if (q->qs[x].qlen == 1) { /* The flow is new */
	323	if (q->tail == SFQ_DEPTH) { /* It is the first flow */
	324	q->tail = x;
	325	q->next[x] = x;
	326	q->allot[x] = q->quantum;
	327	} else {
	328	q->next[x] = q->next[q->tail];
	329	q->next[q->tail] = x;
	330	q->tail = x;
	331	}
	332	}
	333	if (++sch->q.qlen <= q->limit) {
	334	sch->bstats.bytes += qdisc_pkt_len(skb);
	335	sch->bstats.packets++;
	336	return NET_XMIT_SUCCESS;
	337	}
	338
	339	sfq_drop(sch);
	340	return NET_XMIT_CN;
	341	}
	342
	343	static struct sk_buff *
	344	sfq_peek(struct Qdisc *sch)
	345	{
	346	struct sfq_sched_data *q = qdisc_priv(sch);
	347	sfq_index a;
	348
	349	/* No active slots */
	350	if (q->tail == SFQ_DEPTH)
	351	return NULL;
	352
	353	a = q->next[q->tail];
	354	return skb_peek(&q->qs[a]);
	355	}
	356
	357	static struct sk_buff *
	358	sfq_dequeue(struct Qdisc *sch)
	359	{
	360	struct sfq_sched_data *q = qdisc_priv(sch);
	361	struct sk_buff *skb;
	362	sfq_index a, old_a;
	363
	364	/* No active slots */
	365	if (q->tail == SFQ_DEPTH)
	366	return NULL;
	367
	368	a = old_a = q->next[q->tail];
	369
	370	/* Grab packet */
	371	skb = __skb_dequeue(&q->qs[a]);
	372	sfq_dec(q, a);
	373	sch->q.qlen--;
	374	sch->qstats.backlog -= qdisc_pkt_len(skb);
	375
	376	/* Is the slot empty? */
	377	if (q->qs[a].qlen == 0) {
	378	q->ht[q->hash[a]] = SFQ_DEPTH;
	379	a = q->next[a];
	380	if (a == old_a) {
	381	q->tail = SFQ_DEPTH;
	382	return skb;
	383	}
	384	q->next[q->tail] = a;
	385	q->allot[a] += q->quantum;
	386	} else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) {
	387	q->tail = a;
	388	a = q->next[a];
	389	q->allot[a] += q->quantum;
	390	}
	391	return skb;
	392	}
	393
	394	static void
	395	sfq_reset(struct Qdisc *sch)
	396	{
	397	struct sk_buff *skb;
	398
	399	while ((skb = sfq_dequeue(sch)) != NULL)
	400	kfree_skb(skb);
	401	}
	402
	403	static void sfq_perturbation(unsigned long arg)
	404	{
	405	struct Qdisc sch = (struct Qdisc )arg;
	406	struct sfq_sched_data *q = qdisc_priv(sch);
	407
	408	q->perturbation = net_random();
	409
	410	if (q->perturb_period)
	411	mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
	412	}
	413
	414	static int sfq_change(struct Qdisc sch, struct nlattr opt)
	415	{
	416	struct sfq_sched_data *q = qdisc_priv(sch);
	417	struct tc_sfq_qopt *ctl = nla_data(opt);
	418	unsigned int qlen;
	419
	420	if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
	421	return -EINVAL;
	422
	423	sch_tree_lock(sch);
	424	q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
	425	q->perturb_period = ctl->perturb_period * HZ;
	426	if (ctl->limit)
	427	q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);
	428
	429	qlen = sch->q.qlen;
	430	while (sch->q.qlen > q->limit)
	431	sfq_drop(sch);
	432	qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
	433
	434	del_timer(&q->perturb_timer);
	435	if (q->perturb_period) {
	436	mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
	437	q->perturbation = net_random();
	438	}
	439	sch_tree_unlock(sch);
	440	return 0;
	441	}
	442
	443	static int sfq_init(struct Qdisc sch, struct nlattr opt)
	444	{
	445	struct sfq_sched_data *q = qdisc_priv(sch);
	446	int i;
	447
	448	q->perturb_timer.function = sfq_perturbation;
	449	q->perturb_timer.data = (unsigned long)sch;
	450	init_timer_deferrable(&q->perturb_timer);
	451
	452	for (i = 0; i < SFQ_HASH_DIVISOR; i++)
	453	q->ht[i] = SFQ_DEPTH;
	454
	455	for (i = 0; i < SFQ_DEPTH; i++) {
	456	skb_queue_head_init(&q->qs[i]);
	457	q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH;
	458	q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH;
	459	}
	460
	461	q->limit = SFQ_DEPTH - 1;
	462	q->max_depth = 0;
	463	q->tail = SFQ_DEPTH;
	464	if (opt == NULL) {
	465	q->quantum = psched_mtu(qdisc_dev(sch));
	466	q->perturb_period = 0;
	467	q->perturbation = net_random();
	468	} else {
	469	int err = sfq_change(sch, opt);
	470	if (err)
	471	return err;
	472	}
	473
	474	for (i = 0; i < SFQ_DEPTH; i++)
	475	sfq_link(q, i);
	476	return 0;
	477	}
	478
	479	static void sfq_destroy(struct Qdisc *sch)
	480	{
	481	struct sfq_sched_data *q = qdisc_priv(sch);
	482
	483	tcf_destroy_chain(&q->filter_list);
	484	q->perturb_period = 0;
	485	del_timer_sync(&q->perturb_timer);
	486	}
	487
	488	static int sfq_dump(struct Qdisc sch, struct sk_buff skb)
	489	{
	490	struct sfq_sched_data *q = qdisc_priv(sch);
	491	unsigned char *b = skb_tail_pointer(skb);
	492	struct tc_sfq_qopt opt;
	493
	494	opt.quantum = q->quantum;
	495	opt.perturb_period = q->perturb_period / HZ;
	496
	497	opt.limit = q->limit;
	498	opt.divisor = SFQ_HASH_DIVISOR;
	499	opt.flows = q->limit;
	500
	501	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	502
	503	return skb->len;
	504
	505	nla_put_failure:
	506	nlmsg_trim(skb, b);
	507	return -1;
	508	}
	509
	510	static struct Qdisc sfq_leaf(struct Qdisc sch, unsigned long arg)
	511	{
	512	return NULL;
	513	}
	514
	515	static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
	516	{
	517	return 0;
	518	}
	519
	520	static unsigned long sfq_bind(struct Qdisc *sch, unsigned long parent,
	521	u32 classid)
	522	{
	523	return 0;
	524	}
	525
	526	static void sfq_put(struct Qdisc *q, unsigned long cl)
	527	{
	528	}
	529
	530	static struct tcf_proto *sfq_find_tcf(struct Qdisc sch, unsigned long cl)
	531	{
	532	struct sfq_sched_data *q = qdisc_priv(sch);
	533
	534	if (cl)
	535	return NULL;
	536	return &q->filter_list;
	537	}
	538
	539	static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
	540	struct sk_buff skb, struct tcmsg tcm)
	541	{
	542	tcm->tcm_handle \|= TC_H_MIN(cl);
	543	return 0;
	544	}
	545
	546	static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
	547	struct gnet_dump *d)
	548	{
	549	struct sfq_sched_data *q = qdisc_priv(sch);
	550	sfq_index idx = q->ht[cl-1];
	551	struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
	552	struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };
	553
	554	if (gnet_stats_copy_queue(d, &qs) < 0)
	555	return -1;
	556	return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
	557	}
	558
	559	static void sfq_walk(struct Qdisc sch, struct qdisc_walker arg)
	560	{
	561	struct sfq_sched_data *q = qdisc_priv(sch);
	562	unsigned int i;
	563
	564	if (arg->stop)
	565	return;
	566
	567	for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
	568	if (q->ht[i] == SFQ_DEPTH \|\|
	569	arg->count < arg->skip) {
	570	arg->count++;
	571	continue;
	572	}
	573	if (arg->fn(sch, i + 1, arg) < 0) {
	574	arg->stop = 1;
	575	break;
	576	}
	577	arg->count++;
	578	}
	579	}
	580
	581	static const struct Qdisc_class_ops sfq_class_ops = {
	582	.leaf = sfq_leaf,
	583	.get = sfq_get,
	584	.put = sfq_put,
	585	.tcf_chain = sfq_find_tcf,
	586	.bind_tcf = sfq_bind,
	587	.unbind_tcf = sfq_put,
	588	.dump = sfq_dump_class,
	589	.dump_stats = sfq_dump_class_stats,
	590	.walk = sfq_walk,
	591	};
	592
	593	static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
	594	.cl_ops = &sfq_class_ops,
	595	.id = "sfq",
	596	.priv_size = sizeof(struct sfq_sched_data),
	597	.enqueue = sfq_enqueue,
	598	.dequeue = sfq_dequeue,
	599	.peek = sfq_peek,
	600	.drop = sfq_drop,
	601	.init = sfq_init,
	602	.reset = sfq_reset,
	603	.destroy = sfq_destroy,
	604	.change = NULL,
	605	.dump = sfq_dump,
	606	.owner = THIS_MODULE,
	607	};
	608
	609	static int __init sfq_module_init(void)
	610	{
	611	return register_qdisc(&sfq_qdisc_ops);
	612	}
	613	static void __exit sfq_module_exit(void)
	614	{
	615	unregister_qdisc(&sfq_qdisc_ops);
	616	}
	617	module_init(sfq_module_init)
	618	module_exit(sfq_module_exit)
	619	MODULE_LICENSE("GPL");