]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/sched/sch_api.c
8139cp: fix checksum broken
[net-next-2.6.git] / net / sched / sch_api.c
CommitLineData
1da177e4
LT
1/*
2 * net/sched/sch_api.c Packet scheduler API.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10 *
11 * Fixes:
12 *
13 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16 */
17
1da177e4
LT
18#include <linux/module.h>
19#include <linux/types.h>
20#include <linux/kernel.h>
1da177e4 21#include <linux/string.h>
1da177e4 22#include <linux/errno.h>
1da177e4 23#include <linux/skbuff.h>
1da177e4
LT
24#include <linux/init.h>
25#include <linux/proc_fs.h>
26#include <linux/seq_file.h>
27#include <linux/kmod.h>
28#include <linux/list.h>
4179477f 29#include <linux/hrtimer.h>
25bfcd5a 30#include <linux/lockdep.h>
5a0e3ad6 31#include <linux/slab.h>
1da177e4 32
457c4cbc 33#include <net/net_namespace.h>
b854272b 34#include <net/sock.h>
dc5fc579 35#include <net/netlink.h>
1da177e4
LT
36#include <net/pkt_sched.h>
37
7316ae88
TG
38static int qdisc_notify(struct net *net, struct sk_buff *oskb,
39 struct nlmsghdr *n, u32 clid,
1da177e4 40 struct Qdisc *old, struct Qdisc *new);
7316ae88
TG
41static int tclass_notify(struct net *net, struct sk_buff *oskb,
42 struct nlmsghdr *n, struct Qdisc *q,
43 unsigned long cl, int event);
1da177e4
LT
44
45/*
46
47 Short review.
48 -------------
49
50 This file consists of two interrelated parts:
51
52 1. queueing disciplines manager frontend.
53 2. traffic classes manager frontend.
54
55 Generally, queueing discipline ("qdisc") is a black box,
56 which is able to enqueue packets and to dequeue them (when
57 device is ready to send something) in order and at times
58 determined by algorithm hidden in it.
59
60 qdisc's are divided to two categories:
61 - "queues", which have no internal structure visible from outside.
62 - "schedulers", which split all the packets to "traffic classes",
63 using "packet classifiers" (look at cls_api.c)
64
65 In turn, classes may have child qdiscs (as rule, queues)
66 attached to them etc. etc. etc.
67
68 The goal of the routines in this file is to translate
69 information supplied by user in the form of handles
70 to more intelligible for kernel form, to make some sanity
71 checks and part of work, which is common to all qdiscs
72 and to provide rtnetlink notifications.
73
74 All real intelligent work is done inside qdisc modules.
75
76
77
78 Every discipline has two major routines: enqueue and dequeue.
79
80 ---dequeue
81
82 dequeue usually returns a skb to send. It is allowed to return NULL,
83 but it does not mean that queue is empty, it just means that
84 discipline does not want to send anything this time.
85 Queue is really empty if q->q.qlen == 0.
86 For complicated disciplines with multiple queues q->q is not
87 real packet queue, but however q->q.qlen must be valid.
88
89 ---enqueue
90
91 enqueue returns 0, if packet was enqueued successfully.
92 If packet (this one or another one) was dropped, it returns
93 not zero error code.
94 NET_XMIT_DROP - this packet dropped
95 Expected action: do not backoff, but wait until queue will clear.
96 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
97 Expected action: backoff or ignore
98 NET_XMIT_POLICED - dropped by police.
99 Expected action: backoff or error to real-time apps.
100
101 Auxiliary routines:
102
99c0db26
JP
103 ---peek
104
105 like dequeue but without removing a packet from the queue
106
1da177e4
LT
107 ---reset
108
109 returns qdisc to initial state: purge all buffers, clear all
110 timers, counters (except for statistics) etc.
111
112 ---init
113
114 initializes newly created qdisc.
115
116 ---destroy
117
118 destroys resources allocated by init and during lifetime of qdisc.
119
120 ---change
121
122 changes qdisc parameters.
123 */
124
125/* Protects list of registered TC modules. It is pure SMP lock. */
126static DEFINE_RWLOCK(qdisc_mod_lock);
127
128
129/************************************************
130 * Queueing disciplines manipulation. *
131 ************************************************/
132
133
134/* The list of all installed queueing disciplines. */
135
136static struct Qdisc_ops *qdisc_base;
137
138/* Register/unregister queueing discipline */
139
140int register_qdisc(struct Qdisc_ops *qops)
141{
142 struct Qdisc_ops *q, **qp;
143 int rc = -EEXIST;
144
145 write_lock(&qdisc_mod_lock);
146 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
147 if (!strcmp(qops->id, q->id))
148 goto out;
149
150 if (qops->enqueue == NULL)
151 qops->enqueue = noop_qdisc_ops.enqueue;
99c0db26 152 if (qops->peek == NULL) {
68fd26b5 153 if (qops->dequeue == NULL)
99c0db26 154 qops->peek = noop_qdisc_ops.peek;
68fd26b5
JP
155 else
156 goto out_einval;
99c0db26 157 }
1da177e4
LT
158 if (qops->dequeue == NULL)
159 qops->dequeue = noop_qdisc_ops.dequeue;
160
68fd26b5
JP
161 if (qops->cl_ops) {
162 const struct Qdisc_class_ops *cops = qops->cl_ops;
163
3e9e5a59 164 if (!(cops->get && cops->put && cops->walk && cops->leaf))
68fd26b5
JP
165 goto out_einval;
166
167 if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
168 goto out_einval;
169 }
170
1da177e4
LT
171 qops->next = NULL;
172 *qp = qops;
173 rc = 0;
174out:
175 write_unlock(&qdisc_mod_lock);
176 return rc;
68fd26b5
JP
177
178out_einval:
179 rc = -EINVAL;
180 goto out;
1da177e4 181}
62e3ba1b 182EXPORT_SYMBOL(register_qdisc);
1da177e4
LT
183
184int unregister_qdisc(struct Qdisc_ops *qops)
185{
186 struct Qdisc_ops *q, **qp;
187 int err = -ENOENT;
188
189 write_lock(&qdisc_mod_lock);
190 for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
191 if (q == qops)
192 break;
193 if (q) {
194 *qp = q->next;
195 q->next = NULL;
196 err = 0;
197 }
198 write_unlock(&qdisc_mod_lock);
199 return err;
200}
62e3ba1b 201EXPORT_SYMBOL(unregister_qdisc);
1da177e4
LT
202
203/* We know handle. Find qdisc among all qdisc's attached to device
204 (root qdisc, all its children, children of children etc.)
205 */
206
6113b748 207static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
8123b421
DM
208{
209 struct Qdisc *q;
210
211 if (!(root->flags & TCQ_F_BUILTIN) &&
212 root->handle == handle)
213 return root;
214
215 list_for_each_entry(q, &root->list, list) {
216 if (q->handle == handle)
217 return q;
218 }
219 return NULL;
220}
221
f6e0b239
JP
222static void qdisc_list_add(struct Qdisc *q)
223{
f6486d40 224 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
af356afa 225 list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
f6e0b239
JP
226}
227
228void qdisc_list_del(struct Qdisc *q)
229{
f6486d40 230 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
f6e0b239 231 list_del(&q->list);
f6e0b239
JP
232}
233EXPORT_SYMBOL(qdisc_list_del);
234
ead81cc5 235struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
1da177e4 236{
f6e0b239
JP
237 struct Qdisc *q;
238
af356afa
PM
239 q = qdisc_match_from_root(dev->qdisc, handle);
240 if (q)
241 goto out;
f6e0b239 242
24824a09
ED
243 if (dev_ingress_queue(dev))
244 q = qdisc_match_from_root(
245 dev_ingress_queue(dev)->qdisc_sleeping,
246 handle);
f6486d40 247out:
f6e0b239 248 return q;
1da177e4
LT
249}
250
251static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
252{
253 unsigned long cl;
254 struct Qdisc *leaf;
20fea08b 255 const struct Qdisc_class_ops *cops = p->ops->cl_ops;
1da177e4
LT
256
257 if (cops == NULL)
258 return NULL;
259 cl = cops->get(p, classid);
260
261 if (cl == 0)
262 return NULL;
263 leaf = cops->leaf(p, cl);
264 cops->put(p, cl);
265 return leaf;
266}
267
268/* Find queueing discipline by name */
269
1e90474c 270static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
1da177e4
LT
271{
272 struct Qdisc_ops *q = NULL;
273
274 if (kind) {
275 read_lock(&qdisc_mod_lock);
276 for (q = qdisc_base; q; q = q->next) {
1e90474c 277 if (nla_strcmp(kind, q->id) == 0) {
1da177e4
LT
278 if (!try_module_get(q->owner))
279 q = NULL;
280 break;
281 }
282 }
283 read_unlock(&qdisc_mod_lock);
284 }
285 return q;
286}
287
288static struct qdisc_rate_table *qdisc_rtab_list;
289
1e90474c 290struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
1da177e4
LT
291{
292 struct qdisc_rate_table *rtab;
293
294 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
295 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
296 rtab->refcnt++;
297 return rtab;
298 }
299 }
300
5feb5e1a
PM
301 if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
302 nla_len(tab) != TC_RTAB_SIZE)
1da177e4
LT
303 return NULL;
304
305 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
306 if (rtab) {
307 rtab->rate = *r;
308 rtab->refcnt = 1;
1e90474c 309 memcpy(rtab->data, nla_data(tab), 1024);
1da177e4
LT
310 rtab->next = qdisc_rtab_list;
311 qdisc_rtab_list = rtab;
312 }
313 return rtab;
314}
62e3ba1b 315EXPORT_SYMBOL(qdisc_get_rtab);
1da177e4
LT
316
317void qdisc_put_rtab(struct qdisc_rate_table *tab)
318{
319 struct qdisc_rate_table *rtab, **rtabp;
320
321 if (!tab || --tab->refcnt)
322 return;
323
324 for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
325 if (rtab == tab) {
326 *rtabp = rtab->next;
327 kfree(rtab);
328 return;
329 }
330 }
331}
62e3ba1b 332EXPORT_SYMBOL(qdisc_put_rtab);
1da177e4 333
175f9c1b
JK
/* Global registry of shared size tables, protected by qdisc_stab_lock. */
static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

/* Netlink policy for the nested TCA_STAB attribute. */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};
341
342static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
343{
344 struct nlattr *tb[TCA_STAB_MAX + 1];
345 struct qdisc_size_table *stab;
346 struct tc_sizespec *s;
347 unsigned int tsize = 0;
348 u16 *tab = NULL;
349 int err;
350
351 err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
352 if (err < 0)
353 return ERR_PTR(err);
354 if (!tb[TCA_STAB_BASE])
355 return ERR_PTR(-EINVAL);
356
357 s = nla_data(tb[TCA_STAB_BASE]);
358
359 if (s->tsize > 0) {
360 if (!tb[TCA_STAB_DATA])
361 return ERR_PTR(-EINVAL);
362 tab = nla_data(tb[TCA_STAB_DATA]);
363 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
364 }
365
00093fab 366 if (tsize != s->tsize || (!tab && tsize > 0))
175f9c1b
JK
367 return ERR_PTR(-EINVAL);
368
f3b9605d 369 spin_lock(&qdisc_stab_lock);
175f9c1b
JK
370
371 list_for_each_entry(stab, &qdisc_stab_list, list) {
372 if (memcmp(&stab->szopts, s, sizeof(*s)))
373 continue;
374 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
375 continue;
376 stab->refcnt++;
f3b9605d 377 spin_unlock(&qdisc_stab_lock);
175f9c1b
JK
378 return stab;
379 }
380
f3b9605d 381 spin_unlock(&qdisc_stab_lock);
175f9c1b
JK
382
383 stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
384 if (!stab)
385 return ERR_PTR(-ENOMEM);
386
387 stab->refcnt = 1;
388 stab->szopts = *s;
389 if (tsize > 0)
390 memcpy(stab->data, tab, tsize * sizeof(u16));
391
f3b9605d 392 spin_lock(&qdisc_stab_lock);
175f9c1b 393 list_add_tail(&stab->list, &qdisc_stab_list);
f3b9605d 394 spin_unlock(&qdisc_stab_lock);
175f9c1b
JK
395
396 return stab;
397}
398
399void qdisc_put_stab(struct qdisc_size_table *tab)
400{
401 if (!tab)
402 return;
403
f3b9605d 404 spin_lock(&qdisc_stab_lock);
175f9c1b
JK
405
406 if (--tab->refcnt == 0) {
407 list_del(&tab->list);
408 kfree(tab);
409 }
410
f3b9605d 411 spin_unlock(&qdisc_stab_lock);
175f9c1b
JK
412}
413EXPORT_SYMBOL(qdisc_put_stab);
414
415static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
416{
417 struct nlattr *nest;
418
419 nest = nla_nest_start(skb, TCA_STAB);
3aa4614d
PM
420 if (nest == NULL)
421 goto nla_put_failure;
175f9c1b
JK
422 NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
423 nla_nest_end(skb, nest);
424
425 return skb->len;
426
427nla_put_failure:
428 return -1;
429}
430
431void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
432{
433 int pkt_len, slot;
434
435 pkt_len = skb->len + stab->szopts.overhead;
436 if (unlikely(!stab->szopts.tsize))
437 goto out;
438
439 slot = pkt_len + stab->szopts.cell_align;
440 if (unlikely(slot < 0))
441 slot = 0;
442
443 slot >>= stab->szopts.cell_log;
444 if (likely(slot < stab->szopts.tsize))
445 pkt_len = stab->data[slot];
446 else
447 pkt_len = stab->data[stab->szopts.tsize - 1] *
448 (slot / stab->szopts.tsize) +
449 stab->data[slot % stab->szopts.tsize];
450
451 pkt_len <<= stab->szopts.size_log;
452out:
453 if (unlikely(pkt_len < 1))
454 pkt_len = 1;
455 qdisc_skb_cb(skb)->pkt_len = pkt_len;
456}
457EXPORT_SYMBOL(qdisc_calculate_pkt_len);
458
b00355db
JP
459void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
460{
461 if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
462 printk(KERN_WARNING
463 "%s: %s qdisc %X: is non-work-conserving?\n",
464 txt, qdisc->ops->id, qdisc->handle >> 16);
465 qdisc->flags |= TCQ_F_WARN_NONWC;
466 }
467}
468EXPORT_SYMBOL(qdisc_warn_nonwc);
469
4179477f
PM
470static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
471{
472 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
2fbd3da3 473 timer);
4179477f
PM
474
475 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
8608db03 476 __netif_schedule(qdisc_root(wd->qdisc));
1936502d 477
4179477f
PM
478 return HRTIMER_NORESTART;
479}
480
481void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
482{
2fbd3da3
DM
483 hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
484 wd->timer.function = qdisc_watchdog;
4179477f
PM
485 wd->qdisc = qdisc;
486}
487EXPORT_SYMBOL(qdisc_watchdog_init);
488
489void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
490{
491 ktime_t time;
492
2540e051
JP
493 if (test_bit(__QDISC_STATE_DEACTIVATED,
494 &qdisc_root_sleeping(wd->qdisc)->state))
495 return;
496
4179477f
PM
497 wd->qdisc->flags |= TCQ_F_THROTTLED;
498 time = ktime_set(0, 0);
ca44d6e6 499 time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
2fbd3da3 500 hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
4179477f
PM
501}
502EXPORT_SYMBOL(qdisc_watchdog_schedule);
503
504void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
505{
2fbd3da3 506 hrtimer_cancel(&wd->timer);
4179477f
PM
507 wd->qdisc->flags &= ~TCQ_F_THROTTLED;
508}
509EXPORT_SYMBOL(qdisc_watchdog_cancel);
1da177e4 510
a94f779f 511static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
6fe1c7a5
PM
512{
513 unsigned int size = n * sizeof(struct hlist_head), i;
514 struct hlist_head *h;
515
516 if (size <= PAGE_SIZE)
517 h = kmalloc(size, GFP_KERNEL);
518 else
519 h = (struct hlist_head *)
520 __get_free_pages(GFP_KERNEL, get_order(size));
521
522 if (h != NULL) {
523 for (i = 0; i < n; i++)
524 INIT_HLIST_HEAD(&h[i]);
525 }
526 return h;
527}
528
529static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
530{
531 unsigned int size = n * sizeof(struct hlist_head);
532
533 if (size <= PAGE_SIZE)
534 kfree(h);
535 else
536 free_pages((unsigned long)h, get_order(size));
537}
538
539void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
540{
541 struct Qdisc_class_common *cl;
542 struct hlist_node *n, *next;
543 struct hlist_head *nhash, *ohash;
544 unsigned int nsize, nmask, osize;
545 unsigned int i, h;
546
547 /* Rehash when load factor exceeds 0.75 */
548 if (clhash->hashelems * 4 <= clhash->hashsize * 3)
549 return;
550 nsize = clhash->hashsize * 2;
551 nmask = nsize - 1;
552 nhash = qdisc_class_hash_alloc(nsize);
553 if (nhash == NULL)
554 return;
555
556 ohash = clhash->hash;
557 osize = clhash->hashsize;
558
559 sch_tree_lock(sch);
560 for (i = 0; i < osize; i++) {
561 hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
562 h = qdisc_class_hash(cl->classid, nmask);
563 hlist_add_head(&cl->hnode, &nhash[h]);
564 }
565 }
566 clhash->hash = nhash;
567 clhash->hashsize = nsize;
568 clhash->hashmask = nmask;
569 sch_tree_unlock(sch);
570
571 qdisc_class_hash_free(ohash, osize);
572}
573EXPORT_SYMBOL(qdisc_class_hash_grow);
574
575int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
576{
577 unsigned int size = 4;
578
579 clhash->hash = qdisc_class_hash_alloc(size);
580 if (clhash->hash == NULL)
581 return -ENOMEM;
582 clhash->hashsize = size;
583 clhash->hashmask = size - 1;
584 clhash->hashelems = 0;
585 return 0;
586}
587EXPORT_SYMBOL(qdisc_class_hash_init);
588
589void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
590{
591 qdisc_class_hash_free(clhash->hash, clhash->hashsize);
592}
593EXPORT_SYMBOL(qdisc_class_hash_destroy);
594
595void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
596 struct Qdisc_class_common *cl)
597{
598 unsigned int h;
599
600 INIT_HLIST_NODE(&cl->hnode);
601 h = qdisc_class_hash(cl->classid, clhash->hashmask);
602 hlist_add_head(&cl->hnode, &clhash->hash[h]);
603 clhash->hashelems++;
604}
605EXPORT_SYMBOL(qdisc_class_hash_insert);
606
607void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
608 struct Qdisc_class_common *cl)
609{
610 hlist_del(&cl->hnode);
611 clhash->hashelems--;
612}
613EXPORT_SYMBOL(qdisc_class_hash_remove);
614
1da177e4
LT
615/* Allocate a unique handle from space managed by kernel */
616
617static u32 qdisc_alloc_handle(struct net_device *dev)
618{
619 int i = 0x10000;
620 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
621
622 do {
623 autohandle += TC_H_MAKE(0x10000U, 0);
624 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
625 autohandle = TC_H_MAKE(0x80000000U, 0);
626 } while (qdisc_lookup(dev, autohandle) && --i > 0);
627
628 return i>0 ? autohandle : 0;
629}
630
43effa1e
PM
/* Propagate a queue-length decrease of @n packets up the qdisc tree:
 * each ancestor has its qlen reduced, and classful ancestors get a
 * qlen_notify() so they can deactivate now-empty classes.
 */
void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;

	if (n == 0)
		return;
	while ((parentid = sch->parent)) {
		/* Ingress qdiscs have no meaningful parent chain; stop. */
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			return;

		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			/* Only the root may legitimately have no parent. */
			WARN_ON(parentid != TC_H_ROOT);
			return;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
	}
}
EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
1da177e4 658
7316ae88
TG
659static void notify_and_destroy(struct net *net, struct sk_buff *skb,
660 struct nlmsghdr *n, u32 clid,
99194cff
DM
661 struct Qdisc *old, struct Qdisc *new)
662{
663 if (new || old)
7316ae88 664 qdisc_notify(net, skb, n, clid, old, new);
1da177e4 665
4d8863a2 666 if (old)
99194cff 667 qdisc_destroy(old);
99194cff
DM
668}
669
670/* Graft qdisc "new" to class "classid" of qdisc "parent" or
671 * to device "dev".
672 *
673 * When appropriate send a netlink notification using 'skb'
674 * and "n".
675 *
676 * On success, destroy old qdisc.
1da177e4
LT
677 */
678
/* Graft qdisc @new in place of @old, either at device level
 * (@parent == NULL: root or ingress) or under a class of @parent.
 * Sends a netlink notification via @skb/@n and destroys the
 * replaced qdisc on success.
 */
static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		/* Ingress qdiscs occupy a single dedicated queue. */
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev))
				return -ENOENT;
		}

		/* Quiesce the device while swapping root qdiscs. */
		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		/* Multiqueue-aware qdiscs attach themselves per queue. */
		if (new && new->ops->attach) {
			new->ops->attach(new);
			num_q = 0;
		}

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			/* One reference per tx queue beyond the first. */
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			if (!ingress)
				qdisc_destroy(old);
		}

		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				atomic_inc(&new->refcnt);
			dev->qdisc = new ? : &noop_qdisc;
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		/* Graft under a class of a classful parent qdisc. */
		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			} else
				err = -ENOENT;
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}
751
25bfcd5a
JP
752/* lockdep annotation is needed for ingress; egress gets it only for name */
753static struct lock_class_key qdisc_tx_lock;
754static struct lock_class_key qdisc_rx_lock;
755
1da177e4
LT
756/*
757 Allocate and initialize new qdisc.
758
759 Parameters are passed via opt.
760 */
761
762static struct Qdisc *
bb949fbd 763qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
23bcf634
PM
764 struct Qdisc *p, u32 parent, u32 handle,
765 struct nlattr **tca, int *errp)
1da177e4
LT
766{
767 int err;
1e90474c 768 struct nlattr *kind = tca[TCA_KIND];
1da177e4
LT
769 struct Qdisc *sch;
770 struct Qdisc_ops *ops;
175f9c1b 771 struct qdisc_size_table *stab;
1da177e4
LT
772
773 ops = qdisc_lookup_ops(kind);
95a5afca 774#ifdef CONFIG_MODULES
1da177e4
LT
775 if (ops == NULL && kind != NULL) {
776 char name[IFNAMSIZ];
1e90474c 777 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1da177e4
LT
778 /* We dropped the RTNL semaphore in order to
779 * perform the module load. So, even if we
780 * succeeded in loading the module we have to
781 * tell the caller to replay the request. We
782 * indicate this using -EAGAIN.
783 * We replay the request because the device may
784 * go away in the mean time.
785 */
786 rtnl_unlock();
787 request_module("sch_%s", name);
788 rtnl_lock();
789 ops = qdisc_lookup_ops(kind);
790 if (ops != NULL) {
791 /* We will try again qdisc_lookup_ops,
792 * so don't keep a reference.
793 */
794 module_put(ops->owner);
795 err = -EAGAIN;
796 goto err_out;
797 }
798 }
799 }
800#endif
801
b9e2cc0f 802 err = -ENOENT;
1da177e4
LT
803 if (ops == NULL)
804 goto err_out;
805
5ce2d488 806 sch = qdisc_alloc(dev_queue, ops);
3d54b82f
TG
807 if (IS_ERR(sch)) {
808 err = PTR_ERR(sch);
1da177e4 809 goto err_out2;
3d54b82f 810 }
1da177e4 811
ffc8fefa
PM
812 sch->parent = parent;
813
3d54b82f 814 if (handle == TC_H_INGRESS) {
1da177e4 815 sch->flags |= TCQ_F_INGRESS;
3d54b82f 816 handle = TC_H_MAKE(TC_H_INGRESS, 0);
25bfcd5a 817 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
fd44de7c 818 } else {
fd44de7c
PM
819 if (handle == 0) {
820 handle = qdisc_alloc_handle(dev);
821 err = -ENOMEM;
822 if (handle == 0)
823 goto err_out3;
824 }
25bfcd5a 825 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
1da177e4
LT
826 }
827
3d54b82f 828 sch->handle = handle;
1da177e4 829
1e90474c 830 if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
175f9c1b
JK
831 if (tca[TCA_STAB]) {
832 stab = qdisc_get_stab(tca[TCA_STAB]);
833 if (IS_ERR(stab)) {
834 err = PTR_ERR(stab);
7c64b9f3 835 goto err_out4;
175f9c1b
JK
836 }
837 sch->stab = stab;
838 }
1e90474c 839 if (tca[TCA_RATE]) {
f6f9b93f
JP
840 spinlock_t *root_lock;
841
23bcf634
PM
842 err = -EOPNOTSUPP;
843 if (sch->flags & TCQ_F_MQROOT)
844 goto err_out4;
845
f6f9b93f 846 if ((sch->parent != TC_H_ROOT) &&
23bcf634
PM
847 !(sch->flags & TCQ_F_INGRESS) &&
848 (!p || !(p->flags & TCQ_F_MQROOT)))
f6f9b93f
JP
849 root_lock = qdisc_root_sleeping_lock(sch);
850 else
851 root_lock = qdisc_lock(sch);
852
023e09a7 853 err = gen_new_estimator(&sch->bstats, &sch->rate_est,
f6f9b93f 854 root_lock, tca[TCA_RATE]);
23bcf634
PM
855 if (err)
856 goto err_out4;
023e09a7 857 }
f6e0b239
JP
858
859 qdisc_list_add(sch);
1da177e4 860
1da177e4
LT
861 return sch;
862 }
863err_out3:
864 dev_put(dev);
3d54b82f 865 kfree((char *) sch - sch->padded);
1da177e4
LT
866err_out2:
867 module_put(ops->owner);
868err_out:
869 *errp = err;
1da177e4 870 return NULL;
23bcf634
PM
871
872err_out4:
873 /*
874 * Any broken qdiscs that would require a ops->reset() here?
875 * The qdisc was never in action so it shouldn't be necessary.
876 */
7c64b9f3 877 qdisc_put_stab(sch->stab);
23bcf634
PM
878 if (ops->destroy)
879 ops->destroy(sch);
880 goto err_out3;
1da177e4
LT
881}
882
/* Apply a change request to an existing qdisc: options, size table
 * and rate estimator. Note that an absent TCA_STAB drops any
 * previously attached size table.
 */
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	/* Replace (or clear) the size table only after the qdisc's own
	 * change succeeded. */
	qdisc_put_stab(sch->stab);
	sch->stab = stab;

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats, &sch->rate_est,
				      qdisc_root_sleeping_lock(sch),
				      tca[TCA_RATE]);
	}
out:
	return 0;
}
917
/* Walker state for loop detection when grafting a qdisc:
 * @p is the prospective parent, @depth the current recursion depth.
 */
struct check_loop_arg
{
	struct qdisc_walker w;
	struct Qdisc *p;
	int depth;
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);

/* Return -ELOOP if qdisc @p is reachable below @q (which would
 * create a cycle when grafting), 0 otherwise.
 */
static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
{
	struct check_loop_arg arg;

	/* Classless qdiscs have no children, hence no possible loop. */
	if (q->ops->cl_ops == NULL)
		return 0;

	arg.w.stop = arg.w.skip = arg.w.count = 0;
	arg.w.fn = check_loop_fn;
	arg.depth = depth;
	arg.p = p;
	q->ops->cl_ops->walk(q, &arg.w);
	return arg.w.stop ? -ELOOP : 0;
}

/* Per-class callback: recurse into each leaf qdisc; a hit on @p or
 * excessive depth (> 7) aborts the walk with -ELOOP.
 */
static int
check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
{
	struct Qdisc *leaf;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct check_loop_arg *arg = (struct check_loop_arg *)w;

	leaf = cops->leaf(q, cl);
	if (leaf) {
		if (leaf == arg->p || arg->depth > 7)
			return -ELOOP;
		return check_loop(leaf, arg->p, arg->depth + 1);
	}
	return 0;
}
957
958/*
959 * Delete/get qdisc.
960 */
961
/* Handle RTM_DELQDISC / RTM_GETQDISC netlink requests: locate the
 * target qdisc by parent classid or handle, then either graft it
 * away (delete) or send a notification (get).
 */
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid = tcm->tcm_parent;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				/* Resolve via the parent qdisc's class. */
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				if (dev_ingress_queue(dev))
					q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q)
			return -ENOENT;

		/* If a handle was also given, it must match. */
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		/* Built-in/default qdiscs (handle 0) cannot be deleted. */
		if (q->handle == 0)
			return -ENOENT;
		if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}
1018
1019/*
1020 Create/change qdisc.
1021 */
1022
/* Handle RTM_NEWQDISC: create, replace or change a qdisc according
 * to the NLM_F_CREATE/REPLACE/EXCL flags. May restart from scratch
 * (-EAGAIN from qdisc_create after a module load) via the replay
 * label.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

replay:
	/* Reinit, just in case something touches this. */
	tcm = NLMSG_DATA(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else { /* ingress */
				if (dev_ingress_queue_create(dev))
					q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
					return -EEXIST;
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
					goto create_n_graft;
				if (n->nlmsg_flags&NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				/* Refuse grafts that would create a cycle. */
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (q == NULL)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 * We know, that some child q is already
				 * attached to this parent and have choice:
				 * either to change it or to create/graft new one.
				 *
				 * 1. We are allowed to create/graft only
				 * if CREATE and REPLACE flags are set.
				 *
				 * 2. If EXCL is set, requestor wanted to say,
				 * that qdisc tcm_handle is not expected
				 * to exist, so that we choose create/graft too.
				 *
				 * 3. The last case is when no flags are set.
				 * Alas, it is sort of hole in API, we
				 * cannot decide what to do unambiguously.
				 * For now we select create/graft, if
				 * user gave KIND, which does not match existing.
				 */
				if ((n->nlmsg_flags&NLM_F_CREATE) &&
				    (n->nlmsg_flags&NLM_F_REPLACE) &&
				    ((n->nlmsg_flags&NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags&NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags&NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev))
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err);
		else
			err = -ENOENT;
	} else {
		struct netdev_queue *dev_queue;

		/* Let a classful parent choose the target queue. */
		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	}
	if (q == NULL) {
		/* -EAGAIN: a module was loaded, replay the request. */
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}
1170
/* Fill one RTM_NEWQDISC/RTM_DELQDISC message describing qdisc @q into @skb.
 *
 * @clid:  parent class id reported in tcm_parent
 * @pid, @seq, @flags, @event: netlink header fields for the outgoing message
 *
 * Returns skb->len on success, -1 on failure; on failure the skb is trimmed
 * back to its original tail so partially written attributes are discarded.
 * NOTE: NLMSG_NEW and NLA_PUT_STRING are macros that jump to the
 * nlmsg_failure / nla_put_failure labels on overflow.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);	/* rollback point */
	struct gnet_dump d;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;	/* clear padding: don't leak kernel memory */
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	/* tcm_info is (ab)used to report the refcount to user space */
	tcm->tcm_info = atomic_read(&q->refcnt);
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	/* refresh cached queue length before dumping stats */
	q->qstats.qlen = q->q.qlen;

	/* size table (TCA_STAB), if one is attached */
	if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, &q->qstats) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);	/* undo everything written by this call */
	return -1;
}
1219
53b0f080
ED
1220static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1221{
1222 return (q->flags & TCQ_F_BUILTIN) ? true : false;
1223}
1224
7316ae88
TG
/* Send an rtnetlink notification for a qdisc change: RTM_DELQDISC for the
 * qdisc being removed (@old) and/or RTM_NEWQDISC for its replacement (@new).
 * Built-in qdiscs are skipped. @oskb is the request that triggered the
 * change (may be NULL for kernel-originated events).
 *
 * Returns the rtnetlink_send() result when something was sent, -ENOBUFS on
 * allocation failure, or -EINVAL when nothing fit in the message.
 */
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old)) {
		if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new)) {
		/* NLM_F_REPLACE signals "new replaces old" to listeners */
		if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	/* rtnetlink_send() consumes the skb on this path */
	if (skb->len)
		return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}
1252
30723673
DM
/* Dump @root and every qdisc linked on its ->list into @skb for a netlink
 * dump. *q_idx_p is the running qdisc index across calls; entries below
 * @s_q_idx (resume point of an interrupted dump) are counted but skipped.
 *
 * Returns 0 on success (all, or all remaining, entries dumped) and -1 when
 * the skb filled up; *q_idx_p is updated either way so the dump can resume.
 */
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;

	if (!root)
		return 0;

	/* the root qdisc itself is not on its own ->list: handle it first */
	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}
	list_for_each_entry(q, &root->list, list) {
		if (q_idx < s_q_idx) {
			q_idx++;	/* already dumped in a previous pass */
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;	/* message full: tell caller to stop and resume later */
	goto out;
}
1292
1da177e4
LT
/* RTM_GETQDISC dump handler: walk every net device in the namespace under
 * RCU and dump its root qdisc tree plus its ingress qdisc tree.
 *
 * Resume state lives in cb->args: args[0] is the device index, args[1] the
 * qdisc index within that device. Returns skb->len so netlink keeps calling
 * until a pass adds nothing.
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	rcu_read_lock();
	idx = 0;
	for_each_netdev_rcu(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;	/* device fully dumped previously */
		if (idx > s_idx)
			s_q_idx = 0;	/* fresh device: no qdisc skip offset */
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
			goto done;

		/* ingress queue may not exist on this device */
		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx) < 0)
			goto done;

cont:
		idx++;
	}

done:
	rcu_read_unlock();

	/* record resume point for the next invocation */
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
1335
1336
1337
1338/************************************************
1339 * Traffic classes manipulation. *
1340 ************************************************/
1341
1342
1343
/* RTM_{NEW,DEL,GET}TCLASS handler: create, change, delete or query a
 * traffic class on a classful qdisc. Resolves the qdisc from tcm_parent /
 * tcm_handle, then dispatches to the qdisc's class operations.
 * Runs under RTNL. Returns 0 or a negative errno.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = NLMSG_DATA(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;	/* opaque class cookie from cops->get() */
	unsigned long new_cl;
	u32 pid = tcm->tcm_parent;
	u32 clid = tcm->tcm_handle;
	u32 qid = TC_H_MAJ(clid);
	int err;

	if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
		return -ENODEV;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0	 - parent is root class.
	   parent == X:Y	 - parent is a node in hierarchy.
	   parent == 0:Y	 - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0	 - generate handle from kernel pool.
	   handle == 0:Y	 - class is X:Y, where X:0 is qdisc.
	   handle == X:Y	 - clear.
	   handle == X:0	 - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	if (pid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(pid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;	/* default to root qdisc */

		/* Now qid is genuine qdisc handle consistent
		   both with parent and child.

		   TC_H_MAJ(pid) still may be unspecified, complete it now.
		 */
		if (pid)
			pid = TC_H_MAKE(qid, pid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	if ((q = qdisc_lookup(dev, qid)) == NULL)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (pid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);	/* takes a class reference */

	if (cl == 0) {
		err = -ENOENT;
		/* only NEW + NLM_F_CREATE may proceed without an existing class */
		if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags&NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = -EOPNOTSUPP;
			if (cops->delete)
				err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	/* create or change: cops->change may allocate a new class cookie */
	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, pid, tca, &new_cl);
	if (err == 0)
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	if (cl)
		cops->put(q, cl);	/* drop reference taken by cops->get() */

	return err;
}
1464
1465
/* Fill one RTM_{NEW,DEL}TCLASS message describing class @cl of qdisc @q
 * into @skb. Mirrors tc_fill_qdisc() but delegates the class-specific
 * attributes and stats to the qdisc's class ops.
 *
 * Returns skb->len on success, -1 on failure (skb trimmed back).
 * NOTE: NLMSG_NEW / NLA_PUT_STRING jump to the failure labels on overflow.
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 pid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr *nlh;
	unsigned char *b = skb_tail_pointer(skb);	/* rollback point */
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
	tcm = NLMSG_DATA(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;	/* clear padding: don't leak kernel memory */
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	/* cl_ops->dump() below overwrites tcm_handle with the class id */
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 qdisc_root_sleeping_lock(q), &d) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

nlmsg_failure:
nla_put_failure:
	nlmsg_trim(skb, b);	/* undo everything written by this call */
	return -1;
}
1507
7316ae88
TG
/* Send an rtnetlink notification (@event is RTM_NEWTCLASS or RTM_DELTCLASS)
 * about class @cl of qdisc @q to the RTNLGRP_TC group. @oskb is the request
 * that triggered the change (may be NULL).
 * Returns rtnetlink_send() result, -ENOBUFS or -EINVAL.
 */
static int tclass_notify(struct net *net, struct sk_buff *oskb,
			 struct nlmsghdr *n, struct Qdisc *q,
			 unsigned long cl, int event)
{
	struct sk_buff *skb;
	u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/* rtnetlink_send() consumes the skb */
	return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
}
1526
/* Context carried through a class walk while dumping classes to netlink.
 * The walker must be the first member so the callback can cast back from
 * the struct qdisc_walker pointer it receives.
 */
struct qdisc_dump_args
{
	struct qdisc_walker w;		/* embedded walker (must be first) */
	struct sk_buff *skb;		/* dump message under construction */
	struct netlink_callback *cb;	/* netlink dump state */
};
1533
1534static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1535{
1536 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1537
1538 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1539 a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1540}
1541
30723673
DM
/* Dump all classes of one qdisc @q during an RTM_GETTCLASS dump.
 * *t_p counts qdiscs visited so far; qdiscs below @s_t (resume point) are
 * counted but skipped. Class-level resume state lives in cb->args[1].
 * Returns 0 to continue with the next qdisc, -1 when the skb filled up.
 */
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	/* skip invisible qdiscs, already-dumped ones, classless qdiscs,
	 * and qdiscs not matching a requested parent filter */
	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	/* entering a qdisc beyond the resume point: clear per-qdisc state */
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop = 0;
	arg.w.skip = cb->args[1];	/* classes already dumped last pass */
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;	/* class resume point for next pass */
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
1570
1571static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1572 struct tcmsg *tcm, struct netlink_callback *cb,
1573 int *t_p, int s_t)
1574{
1575 struct Qdisc *q;
1576
1577 if (!root)
1578 return 0;
1579
1580 if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1581 return -1;
1582
1583 list_for_each_entry(q, &root->list, list) {
1584 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1585 return -1;
1586 }
1587
1588 return 0;
1589}
1590
1da177e4
LT
/* RTM_GETTCLASS dump handler: dump all classes on the device named by
 * tcm_ifindex, covering the root qdisc tree and the ingress tree.
 * Resume state: cb->args[0] is the per-device qdisc index.
 * Returns skb->len so netlink keeps calling until nothing is added.
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	/* request must at least carry a full tcmsg */
	if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
		return 0;
	/* takes a device reference; released via dev_put() below */
	if ((dev = dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t) < 0)
		goto done;

done:
	cb->args[0] = t;	/* resume point for the next pass */

	dev_put(dev);
	return skb->len;
}
1622
1623/* Main classifier routine: scans classifier chain attached
1624 to this qdisc, (optionally) tests for protocol and asks
1625 specific classifiers.
1626 */
73ca4918
PM
/* Walk the filter chain @tp once, trying each filter whose protocol matches
 * the skb (or is ETH_P_ALL). Returns the first verdict >= 0 from a
 * classifier, or -1 when no filter matched. Does not follow
 * TC_ACT_RECLASSIFY loops - that is tc_classify()'s job.
 */
int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
		       struct tcf_result *res)
{
	__be16 protocol = skb->protocol;
	int err = 0;

	for (; tp; tp = tp->next) {
		/* note: assignment inside the condition - a negative
		 * classify() result means "no match, try next filter" */
		if ((tp->protocol == protocol ||
		     tp->protocol == htons(ETH_P_ALL)) &&
		    (err = tp->classify(skb, tp, res)) >= 0) {
#ifdef CONFIG_NET_CLS_ACT
			/* final (non-reclassify) verdict: clear the verdict
			 * bookkeeping carried in tc_verd */
			if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
				skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
#endif
			return err;
		}
	}
	return -1;
}
EXPORT_SYMBOL(tc_classify_compat);
1647
1da177e4 1648int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
73ca4918 1649 struct tcf_result *res)
1da177e4
LT
1650{
1651 int err = 0;
73ca4918 1652 __be16 protocol;
1da177e4
LT
1653#ifdef CONFIG_NET_CLS_ACT
1654 struct tcf_proto *otp = tp;
1655reclassify:
1656#endif
1657 protocol = skb->protocol;
1658
73ca4918 1659 err = tc_classify_compat(skb, tp, res);
1da177e4 1660#ifdef CONFIG_NET_CLS_ACT
73ca4918
PM
1661 if (err == TC_ACT_RECLASSIFY) {
1662 u32 verd = G_TC_VERD(skb->tc_verd);
1663 tp = otp;
1664
1665 if (verd++ >= MAX_REC_LOOP) {
b60b6592 1666 if (net_ratelimit())
1667 printk(KERN_NOTICE
1668 "%s: packet reclassify loop"
1669 " rule prio %u protocol %02x\n",
1670 tp->q->ops->id,
1671 tp->prio & 0xffff, ntohs(tp->protocol));
73ca4918 1672 return TC_ACT_SHOT;
1da177e4 1673 }
73ca4918
PM
1674 skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
1675 goto reclassify;
1da177e4 1676 }
73ca4918
PM
1677#endif
1678 return err;
1da177e4 1679}
73ca4918 1680EXPORT_SYMBOL(tc_classify);
1da177e4 1681
a48b5a61
PM
1682void tcf_destroy(struct tcf_proto *tp)
1683{
1684 tp->ops->destroy(tp);
1685 module_put(tp->ops->owner);
1686 kfree(tp);
1687}
1688
ff31ab56 1689void tcf_destroy_chain(struct tcf_proto **fl)
a48b5a61
PM
1690{
1691 struct tcf_proto *tp;
1692
ff31ab56
PM
1693 while ((tp = *fl) != NULL) {
1694 *fl = tp->next;
a48b5a61
PM
1695 tcf_destroy(tp);
1696 }
1697}
1698EXPORT_SYMBOL(tcf_destroy_chain);
1699
1da177e4
LT
1700#ifdef CONFIG_PROC_FS
/* seq_file show callback for /proc/net/psched: four hex words describing
 * the scheduler clock - ns per usec, ns per tick, usec per second (legacy
 * constant), and the hrtimer resolution in Hz.
 */
static int psched_show(struct seq_file *seq, void *v)
{
	struct timespec ts;

	hrtimer_get_res(CLOCK_MONOTONIC, &ts);
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,	/* historical field, kept for ABI compat */
		   (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));

	return 0;
}
1713
/* open() for /proc/net/psched: single-shot seq_file around psched_show(). */
static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, NULL);
}
1718
da7071d7 1719static const struct file_operations psched_fops = {
1da177e4
LT
1720 .owner = THIS_MODULE,
1721 .open = psched_open,
1722 .read = seq_read,
1723 .llseek = seq_lseek,
1724 .release = single_release,
10297b99 1725};
7316ae88
TG
1726
1727static int __net_init psched_net_init(struct net *net)
1728{
1729 struct proc_dir_entry *e;
1730
1731 e = proc_net_fops_create(net, "psched", 0, &psched_fops);
1732 if (e == NULL)
1733 return -ENOMEM;
1734
1735 return 0;
1736}
1737
/* Per-netns teardown: remove this namespace's /proc/net/psched entry. */
static void __net_exit psched_net_exit(struct net *net)
{
	proc_net_remove(net, "psched");
}
1742#else
/* Stubs when CONFIG_PROC_FS is disabled: no /proc entry to manage. */
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
1da177e4
LT
1751#endif
1752
7316ae88
TG
/* Per-network-namespace hooks for the /proc/net/psched entry. */
static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};
1757
1da177e4
LT
/* Packet scheduler subsystem init: register the per-netns proc hooks, the
 * built-in qdiscs, and the rtnetlink message handlers for qdisc and class
 * manipulation. Returns 0 or the pernet registration error.
 */
static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		printk(KERN_ERR "pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	/* built-in qdiscs available without module loading */
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);

	/* rtnetlink doit/dumpit handlers for qdiscs and classes */
	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);

	return 0;
}
1783
1784subsys_initcall(pktsched_init);