]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/sched/sch_sfq.c
net: skb->dst accessors
[net-next-2.6.git] / net / sched / sch_sfq.c
CommitLineData
1da177e4
LT
1/*
2 * net/sched/sch_sfq.c Stochastic Fairness Queueing discipline.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 */
11
1da177e4 12#include <linux/module.h>
1da177e4
LT
13#include <linux/types.h>
14#include <linux/kernel.h>
15#include <linux/jiffies.h>
16#include <linux/string.h>
1da177e4
LT
17#include <linux/in.h>
18#include <linux/errno.h>
1da177e4 19#include <linux/init.h>
1da177e4 20#include <linux/ipv6.h>
1da177e4 21#include <linux/skbuff.h>
32740ddc 22#include <linux/jhash.h>
0ba48053
PM
23#include <net/ip.h>
24#include <net/netlink.h>
1da177e4
LT
25#include <net/pkt_sched.h>
26
27
28/* Stochastic Fairness Queuing algorithm.
29 =======================================
30
31 Source:
32 Paul E. McKenney "Stochastic Fairness Queuing",
33 IEEE INFOCOM '90 Proceedings, San Francisco, 1990.
34
35 Paul E. McKenney "Stochastic Fairness Queuing",
36 "Interworking: Research and Experience", v.2, 1991, p.113-131.
37
38
39 See also:
40 M. Shreedhar and George Varghese "Efficient Fair
41 Queuing using Deficit Round Robin", Proc. SIGCOMM 95.
42
43
10297b99 44 This is not the thing that is usually called (W)FQ nowadays.
1da177e4
LT
45 It does not use any timestamp mechanism, but instead
46 processes queues in round-robin order.
47
48 ADVANTAGE:
49
50 - It is very cheap. Both CPU and memory requirements are minimal.
51
52 DRAWBACKS:
53
10297b99 54 - "Stochastic" -> It is not 100% fair.
1da177e4
LT
55 When hash collisions occur, several flows are considered as one.
56
57 - "Round-robin" -> It introduces larger delays than virtual clock
58 based schemes, and should not be used for isolating interactive
59 traffic from non-interactive. It means, that this scheduler
60 should be used as leaf of CBQ or P3, which put interactive traffic
61 to higher priority band.
62
63 We still need true WFQ for top level CSZ, but using WFQ
64 for the best effort traffic is absolutely pointless:
65 SFQ is superior for this purpose.
66
67 IMPLEMENTATION:
68 This implementation limits maximal queue length to 128;
69 maximal mtu to 2^15-1; number of hash buckets to 1024.
70 The only goal of these restrictions was that all data
71 fit into one 4K page :-). Struct sfq_sched_data is
72 organized in anti-cache manner: all the data for a bucket
73 are scattered over different locations. This is not good,
74 but it allowed me to put it into 4K.
75
76 It is easy to increase these values, but not in flight. */
77
/*
 * SFQ_DEPTH sizes the per-slot arrays of struct sfq_sched_data and is also
 * stored in ht[] and in "tail" as the "nothing here" marker.
 */
#define SFQ_DEPTH		128
/* Number of entries in the ht[] hash table. */
#define SFQ_HASH_DIVISOR	1024

/* Index type: it must be able to represent at least SFQ_DEPTH*2 values. */
typedef unsigned char sfq_index;
83
84struct sfq_head
85{
86 sfq_index next;
87 sfq_index prev;
88};
89
90struct sfq_sched_data
91{
92/* Parameters */
93 int perturb_period;
94 unsigned quantum; /* Allotment per round: MUST BE >= MTU */
95 int limit;
96
97/* Variables */
7d2681a6 98 struct tcf_proto *filter_list;
1da177e4 99 struct timer_list perturb_timer;
32740ddc 100 u32 perturbation;
1da177e4
LT
101 sfq_index tail; /* Index of current slot in round */
102 sfq_index max_depth; /* Maximal depth */
103
104 sfq_index ht[SFQ_HASH_DIVISOR]; /* Hash table */
105 sfq_index next[SFQ_DEPTH]; /* Active slots link */
106 short allot[SFQ_DEPTH]; /* Current allotment per slot */
107 unsigned short hash[SFQ_DEPTH]; /* Hash value indexed by slots */
108 struct sk_buff_head qs[SFQ_DEPTH]; /* Slot queue */
109 struct sfq_head dep[SFQ_DEPTH*2]; /* Linked list of slots, indexed by depth */
110};
111
112static __inline__ unsigned sfq_fold_hash(struct sfq_sched_data *q, u32 h, u32 h1)
113{
32740ddc 114 return jhash_2words(h, h1, q->perturbation) & (SFQ_HASH_DIVISOR - 1);
1da177e4
LT
115}
116
117static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb)
118{
119 u32 h, h2;
120
121 switch (skb->protocol) {
60678040 122 case htons(ETH_P_IP):
1da177e4 123 {
eddc9ec5 124 const struct iphdr *iph = ip_hdr(skb);
1da177e4 125 h = iph->daddr;
6f9e98f7 126 h2 = iph->saddr ^ iph->protocol;
1da177e4
LT
127 if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) &&
128 (iph->protocol == IPPROTO_TCP ||
129 iph->protocol == IPPROTO_UDP ||
a8d0f952 130 iph->protocol == IPPROTO_UDPLITE ||
ae82af54
PM
131 iph->protocol == IPPROTO_SCTP ||
132 iph->protocol == IPPROTO_DCCP ||
1da177e4
LT
133 iph->protocol == IPPROTO_ESP))
134 h2 ^= *(((u32*)iph) + iph->ihl);
135 break;
136 }
60678040 137 case htons(ETH_P_IPV6):
1da177e4 138 {
0660e03f 139 struct ipv6hdr *iph = ipv6_hdr(skb);
1da177e4 140 h = iph->daddr.s6_addr32[3];
6f9e98f7 141 h2 = iph->saddr.s6_addr32[3] ^ iph->nexthdr;
1da177e4
LT
142 if (iph->nexthdr == IPPROTO_TCP ||
143 iph->nexthdr == IPPROTO_UDP ||
a8d0f952 144 iph->nexthdr == IPPROTO_UDPLITE ||
ae82af54
PM
145 iph->nexthdr == IPPROTO_SCTP ||
146 iph->nexthdr == IPPROTO_DCCP ||
1da177e4
LT
147 iph->nexthdr == IPPROTO_ESP)
148 h2 ^= *(u32*)&iph[1];
149 break;
150 }
151 default:
adf30907 152 h = (unsigned long)skb_dst(skb) ^ skb->protocol;
6f9e98f7 153 h2 = (unsigned long)skb->sk;
1da177e4 154 }
6f9e98f7 155
1da177e4
LT
156 return sfq_fold_hash(q, h, h2);
157}
158
7d2681a6
PM
159static unsigned int sfq_classify(struct sk_buff *skb, struct Qdisc *sch,
160 int *qerr)
161{
162 struct sfq_sched_data *q = qdisc_priv(sch);
163 struct tcf_result res;
164 int result;
165
166 if (TC_H_MAJ(skb->priority) == sch->handle &&
167 TC_H_MIN(skb->priority) > 0 &&
168 TC_H_MIN(skb->priority) <= SFQ_HASH_DIVISOR)
169 return TC_H_MIN(skb->priority);
170
171 if (!q->filter_list)
172 return sfq_hash(q, skb) + 1;
173
c27f339a 174 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
7d2681a6
PM
175 result = tc_classify(skb, q->filter_list, &res);
176 if (result >= 0) {
177#ifdef CONFIG_NET_CLS_ACT
178 switch (result) {
179 case TC_ACT_STOLEN:
180 case TC_ACT_QUEUED:
378a2f09 181 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
7d2681a6
PM
182 case TC_ACT_SHOT:
183 return 0;
184 }
185#endif
186 if (TC_H_MIN(res.classid) <= SFQ_HASH_DIVISOR)
187 return TC_H_MIN(res.classid);
188 }
189 return 0;
190}
191
1da177e4
LT
192static inline void sfq_link(struct sfq_sched_data *q, sfq_index x)
193{
194 sfq_index p, n;
195 int d = q->qs[x].qlen + SFQ_DEPTH;
196
197 p = d;
198 n = q->dep[d].next;
199 q->dep[x].next = n;
200 q->dep[x].prev = p;
201 q->dep[p].next = q->dep[n].prev = x;
202}
203
204static inline void sfq_dec(struct sfq_sched_data *q, sfq_index x)
205{
206 sfq_index p, n;
207
208 n = q->dep[x].next;
209 p = q->dep[x].prev;
210 q->dep[p].next = n;
211 q->dep[n].prev = p;
212
213 if (n == p && q->max_depth == q->qs[x].qlen + 1)
214 q->max_depth--;
215
216 sfq_link(q, x);
217}
218
219static inline void sfq_inc(struct sfq_sched_data *q, sfq_index x)
220{
221 sfq_index p, n;
222 int d;
223
224 n = q->dep[x].next;
225 p = q->dep[x].prev;
226 q->dep[p].next = n;
227 q->dep[n].prev = p;
228 d = q->qs[x].qlen;
229 if (q->max_depth < d)
230 q->max_depth = d;
231
232 sfq_link(q, x);
233}
234
235static unsigned int sfq_drop(struct Qdisc *sch)
236{
237 struct sfq_sched_data *q = qdisc_priv(sch);
238 sfq_index d = q->max_depth;
239 struct sk_buff *skb;
240 unsigned int len;
241
242 /* Queue is full! Find the longest slot and
243 drop a packet from it */
244
245 if (d > 1) {
6f9e98f7 246 sfq_index x = q->dep[d + SFQ_DEPTH].next;
1da177e4 247 skb = q->qs[x].prev;
0abf77e5 248 len = qdisc_pkt_len(skb);
1da177e4
LT
249 __skb_unlink(skb, &q->qs[x]);
250 kfree_skb(skb);
251 sfq_dec(q, x);
252 sch->q.qlen--;
253 sch->qstats.drops++;
f5539eb8 254 sch->qstats.backlog -= len;
1da177e4
LT
255 return len;
256 }
257
258 if (d == 1) {
259 /* It is difficult to believe, but ALL THE SLOTS HAVE LENGTH 1. */
260 d = q->next[q->tail];
261 q->next[q->tail] = q->next[d];
262 q->allot[q->next[d]] += q->quantum;
263 skb = q->qs[d].prev;
0abf77e5 264 len = qdisc_pkt_len(skb);
1da177e4
LT
265 __skb_unlink(skb, &q->qs[d]);
266 kfree_skb(skb);
267 sfq_dec(q, d);
268 sch->q.qlen--;
269 q->ht[q->hash[d]] = SFQ_DEPTH;
270 sch->qstats.drops++;
f5539eb8 271 sch->qstats.backlog -= len;
1da177e4
LT
272 return len;
273 }
274
275 return 0;
276}
277
278static int
6f9e98f7 279sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
1da177e4
LT
280{
281 struct sfq_sched_data *q = qdisc_priv(sch);
7d2681a6 282 unsigned int hash;
1da177e4 283 sfq_index x;
7f3ff4f6 284 int uninitialized_var(ret);
7d2681a6
PM
285
286 hash = sfq_classify(skb, sch, &ret);
287 if (hash == 0) {
c27f339a 288 if (ret & __NET_XMIT_BYPASS)
7d2681a6
PM
289 sch->qstats.drops++;
290 kfree_skb(skb);
291 return ret;
292 }
293 hash--;
1da177e4
LT
294
295 x = q->ht[hash];
296 if (x == SFQ_DEPTH) {
297 q->ht[hash] = x = q->dep[SFQ_DEPTH].next;
298 q->hash[x] = hash;
299 }
6f9e98f7 300
32740ddc
AK
301 /* If selected queue has length q->limit, this means that
302 * all another queues are empty and that we do simple tail drop,
303 * i.e. drop _this_ packet.
304 */
305 if (q->qs[x].qlen >= q->limit)
306 return qdisc_drop(skb, sch);
307
0abf77e5 308 sch->qstats.backlog += qdisc_pkt_len(skb);
1da177e4
LT
309 __skb_queue_tail(&q->qs[x], skb);
310 sfq_inc(q, x);
311 if (q->qs[x].qlen == 1) { /* The flow is new */
312 if (q->tail == SFQ_DEPTH) { /* It is the first flow */
313 q->tail = x;
314 q->next[x] = x;
315 q->allot[x] = q->quantum;
316 } else {
317 q->next[x] = q->next[q->tail];
318 q->next[q->tail] = x;
319 q->tail = x;
320 }
321 }
5588b40d 322 if (++sch->q.qlen <= q->limit) {
0abf77e5 323 sch->bstats.bytes += qdisc_pkt_len(skb);
1da177e4
LT
324 sch->bstats.packets++;
325 return 0;
326 }
327
328 sfq_drop(sch);
329 return NET_XMIT_CN;
330}
331
48a8f519
PM
332static struct sk_buff *
333sfq_peek(struct Qdisc *sch)
334{
335 struct sfq_sched_data *q = qdisc_priv(sch);
336 sfq_index a;
1da177e4 337
48a8f519
PM
338 /* No active slots */
339 if (q->tail == SFQ_DEPTH)
340 return NULL;
1da177e4 341
48a8f519
PM
342 a = q->next[q->tail];
343 return skb_peek(&q->qs[a]);
344}
1da177e4
LT
345
346static struct sk_buff *
6f9e98f7 347sfq_dequeue(struct Qdisc *sch)
1da177e4
LT
348{
349 struct sfq_sched_data *q = qdisc_priv(sch);
350 struct sk_buff *skb;
351 sfq_index a, old_a;
352
353 /* No active slots */
354 if (q->tail == SFQ_DEPTH)
355 return NULL;
356
357 a = old_a = q->next[q->tail];
358
359 /* Grab packet */
360 skb = __skb_dequeue(&q->qs[a]);
361 sfq_dec(q, a);
362 sch->q.qlen--;
0abf77e5 363 sch->qstats.backlog -= qdisc_pkt_len(skb);
1da177e4
LT
364
365 /* Is the slot empty? */
366 if (q->qs[a].qlen == 0) {
367 q->ht[q->hash[a]] = SFQ_DEPTH;
368 a = q->next[a];
369 if (a == old_a) {
370 q->tail = SFQ_DEPTH;
371 return skb;
372 }
373 q->next[q->tail] = a;
374 q->allot[a] += q->quantum;
0abf77e5 375 } else if ((q->allot[a] -= qdisc_pkt_len(skb)) <= 0) {
1da177e4
LT
376 q->tail = a;
377 a = q->next[a];
378 q->allot[a] += q->quantum;
379 }
380 return skb;
381}
382
383static void
6f9e98f7 384sfq_reset(struct Qdisc *sch)
1da177e4
LT
385{
386 struct sk_buff *skb;
387
388 while ((skb = sfq_dequeue(sch)) != NULL)
389 kfree_skb(skb);
390}
391
392static void sfq_perturbation(unsigned long arg)
393{
6f9e98f7 394 struct Qdisc *sch = (struct Qdisc *)arg;
1da177e4
LT
395 struct sfq_sched_data *q = qdisc_priv(sch);
396
d46f8dd8 397 q->perturbation = net_random();
1da177e4 398
32740ddc
AK
399 if (q->perturb_period)
400 mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
1da177e4
LT
401}
402
1e90474c 403static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
1da177e4
LT
404{
405 struct sfq_sched_data *q = qdisc_priv(sch);
1e90474c 406 struct tc_sfq_qopt *ctl = nla_data(opt);
5e50da01 407 unsigned int qlen;
1da177e4 408
1e90474c 409 if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
1da177e4
LT
410 return -EINVAL;
411
412 sch_tree_lock(sch);
5ce2d488 413 q->quantum = ctl->quantum ? : psched_mtu(qdisc_dev(sch));
6f9e98f7 414 q->perturb_period = ctl->perturb_period * HZ;
1da177e4 415 if (ctl->limit)
32740ddc 416 q->limit = min_t(u32, ctl->limit, SFQ_DEPTH - 1);
1da177e4 417
5e50da01 418 qlen = sch->q.qlen;
5588b40d 419 while (sch->q.qlen > q->limit)
1da177e4 420 sfq_drop(sch);
5e50da01 421 qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
1da177e4
LT
422
423 del_timer(&q->perturb_timer);
424 if (q->perturb_period) {
32740ddc 425 mod_timer(&q->perturb_timer, jiffies + q->perturb_period);
d46f8dd8 426 q->perturbation = net_random();
1da177e4
LT
427 }
428 sch_tree_unlock(sch);
429 return 0;
430}
431
1e90474c 432static int sfq_init(struct Qdisc *sch, struct nlattr *opt)
1da177e4
LT
433{
434 struct sfq_sched_data *q = qdisc_priv(sch);
435 int i;
436
d3e99483 437 q->perturb_timer.function = sfq_perturbation;
c19a28e1 438 q->perturb_timer.data = (unsigned long)sch;
d3e99483 439 init_timer_deferrable(&q->perturb_timer);
1da177e4 440
6f9e98f7 441 for (i = 0; i < SFQ_HASH_DIVISOR; i++)
1da177e4 442 q->ht[i] = SFQ_DEPTH;
6f9e98f7
SH
443
444 for (i = 0; i < SFQ_DEPTH; i++) {
1da177e4 445 skb_queue_head_init(&q->qs[i]);
6f9e98f7
SH
446 q->dep[i + SFQ_DEPTH].next = i + SFQ_DEPTH;
447 q->dep[i + SFQ_DEPTH].prev = i + SFQ_DEPTH;
1da177e4 448 }
6f9e98f7 449
32740ddc 450 q->limit = SFQ_DEPTH - 1;
1da177e4
LT
451 q->max_depth = 0;
452 q->tail = SFQ_DEPTH;
453 if (opt == NULL) {
5ce2d488 454 q->quantum = psched_mtu(qdisc_dev(sch));
1da177e4 455 q->perturb_period = 0;
d46f8dd8 456 q->perturbation = net_random();
1da177e4
LT
457 } else {
458 int err = sfq_change(sch, opt);
459 if (err)
460 return err;
461 }
6f9e98f7
SH
462
463 for (i = 0; i < SFQ_DEPTH; i++)
1da177e4
LT
464 sfq_link(q, i);
465 return 0;
466}
467
468static void sfq_destroy(struct Qdisc *sch)
469{
470 struct sfq_sched_data *q = qdisc_priv(sch);
7d2681a6 471
ff31ab56 472 tcf_destroy_chain(&q->filter_list);
980c478d
JP
473 q->perturb_period = 0;
474 del_timer_sync(&q->perturb_timer);
1da177e4
LT
475}
476
477static int sfq_dump(struct Qdisc *sch, struct sk_buff *skb)
478{
479 struct sfq_sched_data *q = qdisc_priv(sch);
27a884dc 480 unsigned char *b = skb_tail_pointer(skb);
1da177e4
LT
481 struct tc_sfq_qopt opt;
482
483 opt.quantum = q->quantum;
6f9e98f7 484 opt.perturb_period = q->perturb_period / HZ;
1da177e4
LT
485
486 opt.limit = q->limit;
487 opt.divisor = SFQ_HASH_DIVISOR;
cdec7e50 488 opt.flows = q->limit;
1da177e4 489
1e90474c 490 NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
1da177e4
LT
491
492 return skb->len;
493
1e90474c 494nla_put_failure:
dc5fc579 495 nlmsg_trim(skb, b);
1da177e4
LT
496 return -1;
497}
498
7d2681a6
PM
499static int sfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
500 struct nlattr **tca, unsigned long *arg)
501{
502 return -EOPNOTSUPP;
503}
504
505static unsigned long sfq_get(struct Qdisc *sch, u32 classid)
506{
507 return 0;
508}
509
510static struct tcf_proto **sfq_find_tcf(struct Qdisc *sch, unsigned long cl)
511{
512 struct sfq_sched_data *q = qdisc_priv(sch);
513
514 if (cl)
515 return NULL;
516 return &q->filter_list;
517}
518
94de78d1
PM
519static int sfq_dump_class(struct Qdisc *sch, unsigned long cl,
520 struct sk_buff *skb, struct tcmsg *tcm)
521{
522 tcm->tcm_handle |= TC_H_MIN(cl);
523 return 0;
524}
525
526static int sfq_dump_class_stats(struct Qdisc *sch, unsigned long cl,
527 struct gnet_dump *d)
528{
529 struct sfq_sched_data *q = qdisc_priv(sch);
530 sfq_index idx = q->ht[cl-1];
531 struct gnet_stats_queue qs = { .qlen = q->qs[idx].qlen };
532 struct tc_sfq_xstats xstats = { .allot = q->allot[idx] };
533
534 if (gnet_stats_copy_queue(d, &qs) < 0)
535 return -1;
536 return gnet_stats_copy_app(d, &xstats, sizeof(xstats));
537}
538
7d2681a6
PM
539static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
540{
94de78d1
PM
541 struct sfq_sched_data *q = qdisc_priv(sch);
542 unsigned int i;
543
544 if (arg->stop)
545 return;
546
547 for (i = 0; i < SFQ_HASH_DIVISOR; i++) {
548 if (q->ht[i] == SFQ_DEPTH ||
549 arg->count < arg->skip) {
550 arg->count++;
551 continue;
552 }
553 if (arg->fn(sch, i + 1, arg) < 0) {
554 arg->stop = 1;
555 break;
556 }
557 arg->count++;
558 }
7d2681a6
PM
559}
560
561static const struct Qdisc_class_ops sfq_class_ops = {
562 .get = sfq_get,
563 .change = sfq_change_class,
564 .tcf_chain = sfq_find_tcf,
94de78d1
PM
565 .dump = sfq_dump_class,
566 .dump_stats = sfq_dump_class_stats,
7d2681a6
PM
567 .walk = sfq_walk,
568};
569
20fea08b 570static struct Qdisc_ops sfq_qdisc_ops __read_mostly = {
7d2681a6 571 .cl_ops = &sfq_class_ops,
1da177e4
LT
572 .id = "sfq",
573 .priv_size = sizeof(struct sfq_sched_data),
574 .enqueue = sfq_enqueue,
575 .dequeue = sfq_dequeue,
48a8f519 576 .peek = sfq_peek,
1da177e4
LT
577 .drop = sfq_drop,
578 .init = sfq_init,
579 .reset = sfq_reset,
580 .destroy = sfq_destroy,
581 .change = NULL,
582 .dump = sfq_dump,
583 .owner = THIS_MODULE,
584};
585
586static int __init sfq_module_init(void)
587{
588 return register_qdisc(&sfq_qdisc_ops);
589}
10297b99 590static void __exit sfq_module_exit(void)
1da177e4
LT
591{
592 unregister_qdisc(&sfq_qdisc_ops);
593}
594module_init(sfq_module_init)
595module_exit(sfq_module_exit)
596MODULE_LICENSE("GPL");