]> bbs.cooldavid.org Git - net-next-2.6.git/blob - net/sched/sch_api.c
xps: Transmit Packet Steering
[net-next-2.6.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32
33 #include <net/net_namespace.h>
34 #include <net/sock.h>
35 #include <net/netlink.h>
36 #include <net/pkt_sched.h>
37
38 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
39                         struct nlmsghdr *n, u32 clid,
40                         struct Qdisc *old, struct Qdisc *new);
41 static int tclass_notify(struct net *net, struct sk_buff *oskb,
42                          struct nlmsghdr *n, struct Qdisc *q,
43                          unsigned long cl, int event);
44
45 /*
46
47    Short review.
48    -------------
49
50    This file consists of two interrelated parts:
51
52    1. queueing disciplines manager frontend.
53    2. traffic classes manager frontend.
54
55    Generally, queueing discipline ("qdisc") is a black box,
56    which is able to enqueue packets and to dequeue them (when
57    device is ready to send something) in order and at times
58    determined by algorithm hidden in it.
59
   Qdiscs are divided into two categories:
   - "queues", which have no internal structure visible from outside.
   - "schedulers", which split all the packets into "traffic classes",
     using "packet classifiers" (look at cls_api.c)
64
65    In turn, classes may have child qdiscs (as rule, queues)
66    attached to them etc. etc. etc.
67
68    The goal of the routines in this file is to translate
69    information supplied by user in the form of handles
70    to more intelligible for kernel form, to make some sanity
71    checks and part of work, which is common to all qdiscs
72    and to provide rtnetlink notifications.
73
74    All real intelligent work is done inside qdisc modules.
75
76
77
78    Every discipline has two major routines: enqueue and dequeue.
79
80    ---dequeue
81
82    dequeue usually returns a skb to send. It is allowed to return NULL,
83    but it does not mean that queue is empty, it just means that
84    discipline does not want to send anything this time.
85    Queue is really empty if q->q.qlen == 0.
86    For complicated disciplines with multiple queues q->q is not
87    real packet queue, but however q->q.qlen must be valid.
88
89    ---enqueue
90
   enqueue returns 0 if the packet was enqueued successfully.
   If a packet (this one or another one) was dropped, it returns
   a non-zero error code.
94    NET_XMIT_DROP        - this packet dropped
95      Expected action: do not backoff, but wait until queue will clear.
96    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
97      Expected action: backoff or ignore
98    NET_XMIT_POLICED     - dropped by police.
99      Expected action: backoff or error to real-time apps.
100
101    Auxiliary routines:
102
103    ---peek
104
105    like dequeue but without removing a packet from the queue
106
107    ---reset
108
109    returns qdisc to initial state: purge all buffers, clear all
110    timers, counters (except for statistics) etc.
111
112    ---init
113
114    initializes newly created qdisc.
115
116    ---destroy
117
118    destroys resources allocated by init and during lifetime of qdisc.
119
120    ---change
121
122    changes qdisc parameters.
123  */
124
125 /* Protects list of registered TC modules. It is pure SMP lock. */
126 static DEFINE_RWLOCK(qdisc_mod_lock);
127
128
129 /************************************************
130  *      Queueing disciplines manipulation.      *
131  ************************************************/
132
133
134 /* The list of all installed queueing disciplines. */
135
136 static struct Qdisc_ops *qdisc_base;
137
/* Register/unregister queueing discipline */
139
140 int register_qdisc(struct Qdisc_ops *qops)
141 {
142         struct Qdisc_ops *q, **qp;
143         int rc = -EEXIST;
144
145         write_lock(&qdisc_mod_lock);
146         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
147                 if (!strcmp(qops->id, q->id))
148                         goto out;
149
150         if (qops->enqueue == NULL)
151                 qops->enqueue = noop_qdisc_ops.enqueue;
152         if (qops->peek == NULL) {
153                 if (qops->dequeue == NULL)
154                         qops->peek = noop_qdisc_ops.peek;
155                 else
156                         goto out_einval;
157         }
158         if (qops->dequeue == NULL)
159                 qops->dequeue = noop_qdisc_ops.dequeue;
160
161         if (qops->cl_ops) {
162                 const struct Qdisc_class_ops *cops = qops->cl_ops;
163
164                 if (!(cops->get && cops->put && cops->walk && cops->leaf))
165                         goto out_einval;
166
167                 if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
168                         goto out_einval;
169         }
170
171         qops->next = NULL;
172         *qp = qops;
173         rc = 0;
174 out:
175         write_unlock(&qdisc_mod_lock);
176         return rc;
177
178 out_einval:
179         rc = -EINVAL;
180         goto out;
181 }
182 EXPORT_SYMBOL(register_qdisc);
183
184 int unregister_qdisc(struct Qdisc_ops *qops)
185 {
186         struct Qdisc_ops *q, **qp;
187         int err = -ENOENT;
188
189         write_lock(&qdisc_mod_lock);
190         for (qp = &qdisc_base; (q=*qp)!=NULL; qp = &q->next)
191                 if (q == qops)
192                         break;
193         if (q) {
194                 *qp = q->next;
195                 q->next = NULL;
196                 err = 0;
197         }
198         write_unlock(&qdisc_mod_lock);
199         return err;
200 }
201 EXPORT_SYMBOL(unregister_qdisc);
202
203 /* We know handle. Find qdisc among all qdisc's attached to device
204    (root qdisc, all its children, children of children etc.)
205  */
206
207 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
208 {
209         struct Qdisc *q;
210
211         if (!(root->flags & TCQ_F_BUILTIN) &&
212             root->handle == handle)
213                 return root;
214
215         list_for_each_entry(q, &root->list, list) {
216                 if (q->handle == handle)
217                         return q;
218         }
219         return NULL;
220 }
221
222 static void qdisc_list_add(struct Qdisc *q)
223 {
224         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
225                 list_add_tail(&q->list, &qdisc_dev(q)->qdisc->list);
226 }
227
228 void qdisc_list_del(struct Qdisc *q)
229 {
230         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
231                 list_del(&q->list);
232 }
233 EXPORT_SYMBOL(qdisc_list_del);
234
235 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
236 {
237         struct Qdisc *q;
238
239         q = qdisc_match_from_root(dev->qdisc, handle);
240         if (q)
241                 goto out;
242
243         if (dev_ingress_queue(dev))
244                 q = qdisc_match_from_root(
245                         dev_ingress_queue(dev)->qdisc_sleeping,
246                         handle);
247 out:
248         return q;
249 }
250
251 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
252 {
253         unsigned long cl;
254         struct Qdisc *leaf;
255         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
256
257         if (cops == NULL)
258                 return NULL;
259         cl = cops->get(p, classid);
260
261         if (cl == 0)
262                 return NULL;
263         leaf = cops->leaf(p, cl);
264         cops->put(p, cl);
265         return leaf;
266 }
267
268 /* Find queueing discipline by name */
269
270 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
271 {
272         struct Qdisc_ops *q = NULL;
273
274         if (kind) {
275                 read_lock(&qdisc_mod_lock);
276                 for (q = qdisc_base; q; q = q->next) {
277                         if (nla_strcmp(kind, q->id) == 0) {
278                                 if (!try_module_get(q->owner))
279                                         q = NULL;
280                                 break;
281                         }
282                 }
283                 read_unlock(&qdisc_mod_lock);
284         }
285         return q;
286 }
287
288 static struct qdisc_rate_table *qdisc_rtab_list;
289
290 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, struct nlattr *tab)
291 {
292         struct qdisc_rate_table *rtab;
293
294         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
295                 if (memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) == 0) {
296                         rtab->refcnt++;
297                         return rtab;
298                 }
299         }
300
301         if (tab == NULL || r->rate == 0 || r->cell_log == 0 ||
302             nla_len(tab) != TC_RTAB_SIZE)
303                 return NULL;
304
305         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
306         if (rtab) {
307                 rtab->rate = *r;
308                 rtab->refcnt = 1;
309                 memcpy(rtab->data, nla_data(tab), 1024);
310                 rtab->next = qdisc_rtab_list;
311                 qdisc_rtab_list = rtab;
312         }
313         return rtab;
314 }
315 EXPORT_SYMBOL(qdisc_get_rtab);
316
317 void qdisc_put_rtab(struct qdisc_rate_table *tab)
318 {
319         struct qdisc_rate_table *rtab, **rtabp;
320
321         if (!tab || --tab->refcnt)
322                 return;
323
324         for (rtabp = &qdisc_rtab_list; (rtab=*rtabp) != NULL; rtabp = &rtab->next) {
325                 if (rtab == tab) {
326                         *rtabp = rtab->next;
327                         kfree(rtab);
328                         return;
329                 }
330         }
331 }
332 EXPORT_SYMBOL(qdisc_put_rtab);
333
334 static LIST_HEAD(qdisc_stab_list);
335 static DEFINE_SPINLOCK(qdisc_stab_lock);
336
337 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
338         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
339         [TCA_STAB_DATA] = { .type = NLA_BINARY },
340 };
341
342 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
343 {
344         struct nlattr *tb[TCA_STAB_MAX + 1];
345         struct qdisc_size_table *stab;
346         struct tc_sizespec *s;
347         unsigned int tsize = 0;
348         u16 *tab = NULL;
349         int err;
350
351         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
352         if (err < 0)
353                 return ERR_PTR(err);
354         if (!tb[TCA_STAB_BASE])
355                 return ERR_PTR(-EINVAL);
356
357         s = nla_data(tb[TCA_STAB_BASE]);
358
359         if (s->tsize > 0) {
360                 if (!tb[TCA_STAB_DATA])
361                         return ERR_PTR(-EINVAL);
362                 tab = nla_data(tb[TCA_STAB_DATA]);
363                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
364         }
365
366         if (tsize != s->tsize || (!tab && tsize > 0))
367                 return ERR_PTR(-EINVAL);
368
369         spin_lock(&qdisc_stab_lock);
370
371         list_for_each_entry(stab, &qdisc_stab_list, list) {
372                 if (memcmp(&stab->szopts, s, sizeof(*s)))
373                         continue;
374                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
375                         continue;
376                 stab->refcnt++;
377                 spin_unlock(&qdisc_stab_lock);
378                 return stab;
379         }
380
381         spin_unlock(&qdisc_stab_lock);
382
383         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
384         if (!stab)
385                 return ERR_PTR(-ENOMEM);
386
387         stab->refcnt = 1;
388         stab->szopts = *s;
389         if (tsize > 0)
390                 memcpy(stab->data, tab, tsize * sizeof(u16));
391
392         spin_lock(&qdisc_stab_lock);
393         list_add_tail(&stab->list, &qdisc_stab_list);
394         spin_unlock(&qdisc_stab_lock);
395
396         return stab;
397 }
398
399 void qdisc_put_stab(struct qdisc_size_table *tab)
400 {
401         if (!tab)
402                 return;
403
404         spin_lock(&qdisc_stab_lock);
405
406         if (--tab->refcnt == 0) {
407                 list_del(&tab->list);
408                 kfree(tab);
409         }
410
411         spin_unlock(&qdisc_stab_lock);
412 }
413 EXPORT_SYMBOL(qdisc_put_stab);
414
415 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
416 {
417         struct nlattr *nest;
418
419         nest = nla_nest_start(skb, TCA_STAB);
420         if (nest == NULL)
421                 goto nla_put_failure;
422         NLA_PUT(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts);
423         nla_nest_end(skb, nest);
424
425         return skb->len;
426
427 nla_put_failure:
428         return -1;
429 }
430
431 void qdisc_calculate_pkt_len(struct sk_buff *skb, struct qdisc_size_table *stab)
432 {
433         int pkt_len, slot;
434
435         pkt_len = skb->len + stab->szopts.overhead;
436         if (unlikely(!stab->szopts.tsize))
437                 goto out;
438
439         slot = pkt_len + stab->szopts.cell_align;
440         if (unlikely(slot < 0))
441                 slot = 0;
442
443         slot >>= stab->szopts.cell_log;
444         if (likely(slot < stab->szopts.tsize))
445                 pkt_len = stab->data[slot];
446         else
447                 pkt_len = stab->data[stab->szopts.tsize - 1] *
448                                 (slot / stab->szopts.tsize) +
449                                 stab->data[slot % stab->szopts.tsize];
450
451         pkt_len <<= stab->szopts.size_log;
452 out:
453         if (unlikely(pkt_len < 1))
454                 pkt_len = 1;
455         qdisc_skb_cb(skb)->pkt_len = pkt_len;
456 }
457 EXPORT_SYMBOL(qdisc_calculate_pkt_len);
458
459 void qdisc_warn_nonwc(char *txt, struct Qdisc *qdisc)
460 {
461         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
462                 printk(KERN_WARNING
463                        "%s: %s qdisc %X: is non-work-conserving?\n",
464                        txt, qdisc->ops->id, qdisc->handle >> 16);
465                 qdisc->flags |= TCQ_F_WARN_NONWC;
466         }
467 }
468 EXPORT_SYMBOL(qdisc_warn_nonwc);
469
470 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
471 {
472         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
473                                                  timer);
474
475         wd->qdisc->flags &= ~TCQ_F_THROTTLED;
476         __netif_schedule(qdisc_root(wd->qdisc));
477
478         return HRTIMER_NORESTART;
479 }
480
481 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
482 {
483         hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
484         wd->timer.function = qdisc_watchdog;
485         wd->qdisc = qdisc;
486 }
487 EXPORT_SYMBOL(qdisc_watchdog_init);
488
489 void qdisc_watchdog_schedule(struct qdisc_watchdog *wd, psched_time_t expires)
490 {
491         ktime_t time;
492
493         if (test_bit(__QDISC_STATE_DEACTIVATED,
494                      &qdisc_root_sleeping(wd->qdisc)->state))
495                 return;
496
497         wd->qdisc->flags |= TCQ_F_THROTTLED;
498         time = ktime_set(0, 0);
499         time = ktime_add_ns(time, PSCHED_TICKS2NS(expires));
500         hrtimer_start(&wd->timer, time, HRTIMER_MODE_ABS);
501 }
502 EXPORT_SYMBOL(qdisc_watchdog_schedule);
503
504 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
505 {
506         hrtimer_cancel(&wd->timer);
507         wd->qdisc->flags &= ~TCQ_F_THROTTLED;
508 }
509 EXPORT_SYMBOL(qdisc_watchdog_cancel);
510
511 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
512 {
513         unsigned int size = n * sizeof(struct hlist_head), i;
514         struct hlist_head *h;
515
516         if (size <= PAGE_SIZE)
517                 h = kmalloc(size, GFP_KERNEL);
518         else
519                 h = (struct hlist_head *)
520                         __get_free_pages(GFP_KERNEL, get_order(size));
521
522         if (h != NULL) {
523                 for (i = 0; i < n; i++)
524                         INIT_HLIST_HEAD(&h[i]);
525         }
526         return h;
527 }
528
529 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
530 {
531         unsigned int size = n * sizeof(struct hlist_head);
532
533         if (size <= PAGE_SIZE)
534                 kfree(h);
535         else
536                 free_pages((unsigned long)h, get_order(size));
537 }
538
539 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
540 {
541         struct Qdisc_class_common *cl;
542         struct hlist_node *n, *next;
543         struct hlist_head *nhash, *ohash;
544         unsigned int nsize, nmask, osize;
545         unsigned int i, h;
546
547         /* Rehash when load factor exceeds 0.75 */
548         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
549                 return;
550         nsize = clhash->hashsize * 2;
551         nmask = nsize - 1;
552         nhash = qdisc_class_hash_alloc(nsize);
553         if (nhash == NULL)
554                 return;
555
556         ohash = clhash->hash;
557         osize = clhash->hashsize;
558
559         sch_tree_lock(sch);
560         for (i = 0; i < osize; i++) {
561                 hlist_for_each_entry_safe(cl, n, next, &ohash[i], hnode) {
562                         h = qdisc_class_hash(cl->classid, nmask);
563                         hlist_add_head(&cl->hnode, &nhash[h]);
564                 }
565         }
566         clhash->hash     = nhash;
567         clhash->hashsize = nsize;
568         clhash->hashmask = nmask;
569         sch_tree_unlock(sch);
570
571         qdisc_class_hash_free(ohash, osize);
572 }
573 EXPORT_SYMBOL(qdisc_class_hash_grow);
574
575 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
576 {
577         unsigned int size = 4;
578
579         clhash->hash = qdisc_class_hash_alloc(size);
580         if (clhash->hash == NULL)
581                 return -ENOMEM;
582         clhash->hashsize  = size;
583         clhash->hashmask  = size - 1;
584         clhash->hashelems = 0;
585         return 0;
586 }
587 EXPORT_SYMBOL(qdisc_class_hash_init);
588
589 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
590 {
591         qdisc_class_hash_free(clhash->hash, clhash->hashsize);
592 }
593 EXPORT_SYMBOL(qdisc_class_hash_destroy);
594
595 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
596                              struct Qdisc_class_common *cl)
597 {
598         unsigned int h;
599
600         INIT_HLIST_NODE(&cl->hnode);
601         h = qdisc_class_hash(cl->classid, clhash->hashmask);
602         hlist_add_head(&cl->hnode, &clhash->hash[h]);
603         clhash->hashelems++;
604 }
605 EXPORT_SYMBOL(qdisc_class_hash_insert);
606
607 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
608                              struct Qdisc_class_common *cl)
609 {
610         hlist_del(&cl->hnode);
611         clhash->hashelems--;
612 }
613 EXPORT_SYMBOL(qdisc_class_hash_remove);
614
615 /* Allocate an unique handle from space managed by kernel */
616
617 static u32 qdisc_alloc_handle(struct net_device *dev)
618 {
619         int i = 0x10000;
620         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
621
622         do {
623                 autohandle += TC_H_MAKE(0x10000U, 0);
624                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
625                         autohandle = TC_H_MAKE(0x80000000U, 0);
626         } while (qdisc_lookup(dev, autohandle) && --i > 0);
627
628         return i>0 ? autohandle : 0;
629 }
630
631 void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
632 {
633         const struct Qdisc_class_ops *cops;
634         unsigned long cl;
635         u32 parentid;
636
637         if (n == 0)
638                 return;
639         while ((parentid = sch->parent)) {
640                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
641                         return;
642
643                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
644                 if (sch == NULL) {
645                         WARN_ON(parentid != TC_H_ROOT);
646                         return;
647                 }
648                 cops = sch->ops->cl_ops;
649                 if (cops->qlen_notify) {
650                         cl = cops->get(sch, parentid);
651                         cops->qlen_notify(sch, cl);
652                         cops->put(sch, cl);
653                 }
654                 sch->q.qlen -= n;
655         }
656 }
657 EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
658
659 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
660                                struct nlmsghdr *n, u32 clid,
661                                struct Qdisc *old, struct Qdisc *new)
662 {
663         if (new || old)
664                 qdisc_notify(net, skb, n, clid, old, new);
665
666         if (old)
667                 qdisc_destroy(old);
668 }
669
670 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
671  * to device "dev".
672  *
673  * When appropriate send a netlink notification using 'skb'
674  * and "n".
675  *
676  * On success, destroy old qdisc.
677  */
678
679 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
680                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
681                        struct Qdisc *new, struct Qdisc *old)
682 {
683         struct Qdisc *q = old;
684         struct net *net = dev_net(dev);
685         int err = 0;
686
687         if (parent == NULL) {
688                 unsigned int i, num_q, ingress;
689
690                 ingress = 0;
691                 num_q = dev->num_tx_queues;
692                 if ((q && q->flags & TCQ_F_INGRESS) ||
693                     (new && new->flags & TCQ_F_INGRESS)) {
694                         num_q = 1;
695                         ingress = 1;
696                         if (!dev_ingress_queue(dev))
697                                 return -ENOENT;
698                 }
699
700                 if (dev->flags & IFF_UP)
701                         dev_deactivate(dev);
702
703                 if (new && new->ops->attach) {
704                         new->ops->attach(new);
705                         num_q = 0;
706                 }
707
708                 for (i = 0; i < num_q; i++) {
709                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
710
711                         if (!ingress)
712                                 dev_queue = netdev_get_tx_queue(dev, i);
713
714                         old = dev_graft_qdisc(dev_queue, new);
715                         if (new && i > 0)
716                                 atomic_inc(&new->refcnt);
717
718                         if (!ingress)
719                                 qdisc_destroy(old);
720                 }
721
722                 if (!ingress) {
723                         notify_and_destroy(net, skb, n, classid,
724                                            dev->qdisc, new);
725                         if (new && !new->ops->attach)
726                                 atomic_inc(&new->refcnt);
727                         dev->qdisc = new ? : &noop_qdisc;
728                 } else {
729                         notify_and_destroy(net, skb, n, classid, old, new);
730                 }
731
732                 if (dev->flags & IFF_UP)
733                         dev_activate(dev);
734         } else {
735                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
736
737                 err = -EOPNOTSUPP;
738                 if (cops && cops->graft) {
739                         unsigned long cl = cops->get(parent, classid);
740                         if (cl) {
741                                 err = cops->graft(parent, cl, new, &old);
742                                 cops->put(parent, cl);
743                         } else
744                                 err = -ENOENT;
745                 }
746                 if (!err)
747                         notify_and_destroy(net, skb, n, classid, old, new);
748         }
749         return err;
750 }
751
752 /* lockdep annotation is needed for ingress; egress gets it only for name */
753 static struct lock_class_key qdisc_tx_lock;
754 static struct lock_class_key qdisc_rx_lock;
755
756 /*
757    Allocate and initialize new qdisc.
758
759    Parameters are passed via opt.
760  */
761
762 static struct Qdisc *
763 qdisc_create(struct net_device *dev, struct netdev_queue *dev_queue,
764              struct Qdisc *p, u32 parent, u32 handle,
765              struct nlattr **tca, int *errp)
766 {
767         int err;
768         struct nlattr *kind = tca[TCA_KIND];
769         struct Qdisc *sch;
770         struct Qdisc_ops *ops;
771         struct qdisc_size_table *stab;
772
773         ops = qdisc_lookup_ops(kind);
774 #ifdef CONFIG_MODULES
775         if (ops == NULL && kind != NULL) {
776                 char name[IFNAMSIZ];
777                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
778                         /* We dropped the RTNL semaphore in order to
779                          * perform the module load.  So, even if we
780                          * succeeded in loading the module we have to
781                          * tell the caller to replay the request.  We
782                          * indicate this using -EAGAIN.
783                          * We replay the request because the device may
784                          * go away in the mean time.
785                          */
786                         rtnl_unlock();
787                         request_module("sch_%s", name);
788                         rtnl_lock();
789                         ops = qdisc_lookup_ops(kind);
790                         if (ops != NULL) {
791                                 /* We will try again qdisc_lookup_ops,
792                                  * so don't keep a reference.
793                                  */
794                                 module_put(ops->owner);
795                                 err = -EAGAIN;
796                                 goto err_out;
797                         }
798                 }
799         }
800 #endif
801
802         err = -ENOENT;
803         if (ops == NULL)
804                 goto err_out;
805
806         sch = qdisc_alloc(dev_queue, ops);
807         if (IS_ERR(sch)) {
808                 err = PTR_ERR(sch);
809                 goto err_out2;
810         }
811
812         sch->parent = parent;
813
814         if (handle == TC_H_INGRESS) {
815                 sch->flags |= TCQ_F_INGRESS;
816                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
817                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
818         } else {
819                 if (handle == 0) {
820                         handle = qdisc_alloc_handle(dev);
821                         err = -ENOMEM;
822                         if (handle == 0)
823                                 goto err_out3;
824                 }
825                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
826         }
827
828         sch->handle = handle;
829
830         if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
831                 if (tca[TCA_STAB]) {
832                         stab = qdisc_get_stab(tca[TCA_STAB]);
833                         if (IS_ERR(stab)) {
834                                 err = PTR_ERR(stab);
835                                 goto err_out4;
836                         }
837                         sch->stab = stab;
838                 }
839                 if (tca[TCA_RATE]) {
840                         spinlock_t *root_lock;
841
842                         err = -EOPNOTSUPP;
843                         if (sch->flags & TCQ_F_MQROOT)
844                                 goto err_out4;
845
846                         if ((sch->parent != TC_H_ROOT) &&
847                             !(sch->flags & TCQ_F_INGRESS) &&
848                             (!p || !(p->flags & TCQ_F_MQROOT)))
849                                 root_lock = qdisc_root_sleeping_lock(sch);
850                         else
851                                 root_lock = qdisc_lock(sch);
852
853                         err = gen_new_estimator(&sch->bstats, &sch->rate_est,
854                                                 root_lock, tca[TCA_RATE]);
855                         if (err)
856                                 goto err_out4;
857                 }
858
859                 qdisc_list_add(sch);
860
861                 return sch;
862         }
863 err_out3:
864         dev_put(dev);
865         kfree((char *) sch - sch->padded);
866 err_out2:
867         module_put(ops->owner);
868 err_out:
869         *errp = err;
870         return NULL;
871
872 err_out4:
873         /*
874          * Any broken qdiscs that would require a ops->reset() here?
875          * The qdisc was never in action so it shouldn't be necessary.
876          */
877         qdisc_put_stab(sch->stab);
878         if (ops->destroy)
879                 ops->destroy(sch);
880         goto err_out3;
881 }
882
883 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
884 {
885         struct qdisc_size_table *stab = NULL;
886         int err = 0;
887
888         if (tca[TCA_OPTIONS]) {
889                 if (sch->ops->change == NULL)
890                         return -EINVAL;
891                 err = sch->ops->change(sch, tca[TCA_OPTIONS]);
892                 if (err)
893                         return err;
894         }
895
896         if (tca[TCA_STAB]) {
897                 stab = qdisc_get_stab(tca[TCA_STAB]);
898                 if (IS_ERR(stab))
899                         return PTR_ERR(stab);
900         }
901
902         qdisc_put_stab(sch->stab);
903         sch->stab = stab;
904
905         if (tca[TCA_RATE]) {
906                 /* NB: ignores errors from replace_estimator
907                    because change can't be undone. */
908                 if (sch->flags & TCQ_F_MQROOT)
909                         goto out;
910                 gen_replace_estimator(&sch->bstats, &sch->rate_est,
911                                             qdisc_root_sleeping_lock(sch),
912                                             tca[TCA_RATE]);
913         }
914 out:
915         return 0;
916 }
917
/*
 * Walker state used by check_loop() / check_loop_fn() while scanning the
 * classes of a qdisc.
 */
struct check_loop_arg
{
        /* Kept first: check_loop_fn() casts its walker pointer to this struct. */
        struct qdisc_walker     w;
        /* Qdisc each leaf is compared against; a match is reported as -ELOOP. */
        struct Qdisc            *p;
        /* Current nesting level; check_loop_fn() gives up with -ELOOP above 7. */
        int                     depth;
};
924
925 static int check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w);
926
927 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
928 {
929         struct check_loop_arg   arg;
930
931         if (q->ops->cl_ops == NULL)
932                 return 0;
933
934         arg.w.stop = arg.w.skip = arg.w.count = 0;
935         arg.w.fn = check_loop_fn;
936         arg.depth = depth;
937         arg.p = p;
938         q->ops->cl_ops->walk(q, &arg.w);
939         return arg.w.stop ? -ELOOP : 0;
940 }
941
942 static int
943 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
944 {
945         struct Qdisc *leaf;
946         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
947         struct check_loop_arg *arg = (struct check_loop_arg *)w;
948
949         leaf = cops->leaf(q, cl);
950         if (leaf) {
951                 if (leaf == arg->p || arg->depth > 7)
952                         return -ELOOP;
953                 return check_loop(leaf, arg->p, arg->depth + 1);
954         }
955         return 0;
956 }
957
958 /*
959  * Delete/get qdisc.
960  */
961
962 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
963 {
964         struct net *net = sock_net(skb->sk);
965         struct tcmsg *tcm = NLMSG_DATA(n);
966         struct nlattr *tca[TCA_MAX + 1];
967         struct net_device *dev;
968         u32 clid = tcm->tcm_parent;
969         struct Qdisc *q = NULL;
970         struct Qdisc *p = NULL;
971         int err;
972
973         if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
974                 return -ENODEV;
975
976         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
977         if (err < 0)
978                 return err;
979
980         if (clid) {
981                 if (clid != TC_H_ROOT) {
982                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
983                                 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
984                                         return -ENOENT;
985                                 q = qdisc_leaf(p, clid);
986                         } else { /* ingress */
987                                 if (dev_ingress_queue(dev))
988                                         q = dev_ingress_queue(dev)->qdisc_sleeping;
989                         }
990                 } else {
991                         q = dev->qdisc;
992                 }
993                 if (!q)
994                         return -ENOENT;
995
996                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
997                         return -EINVAL;
998         } else {
999                 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
1000                         return -ENOENT;
1001         }
1002
1003         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1004                 return -EINVAL;
1005
1006         if (n->nlmsg_type == RTM_DELQDISC) {
1007                 if (!clid)
1008                         return -EINVAL;
1009                 if (q->handle == 0)
1010                         return -ENOENT;
1011                 if ((err = qdisc_graft(dev, p, skb, n, clid, NULL, q)) != 0)
1012                         return err;
1013         } else {
1014                 qdisc_notify(net, skb, n, clid, NULL, q);
1015         }
1016         return 0;
1017 }
1018
1019 /*
1020    Create/change qdisc.
1021  */
1022
1023 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1024 {
1025         struct net *net = sock_net(skb->sk);
1026         struct tcmsg *tcm;
1027         struct nlattr *tca[TCA_MAX + 1];
1028         struct net_device *dev;
1029         u32 clid;
1030         struct Qdisc *q, *p;
1031         int err;
1032
1033 replay:
1034         /* Reinit, just in case something touches this. */
1035         tcm = NLMSG_DATA(n);
1036         clid = tcm->tcm_parent;
1037         q = p = NULL;
1038
1039         if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
1040                 return -ENODEV;
1041
1042         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1043         if (err < 0)
1044                 return err;
1045
1046         if (clid) {
1047                 if (clid != TC_H_ROOT) {
1048                         if (clid != TC_H_INGRESS) {
1049                                 if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL)
1050                                         return -ENOENT;
1051                                 q = qdisc_leaf(p, clid);
1052                         } else { /* ingress */
1053                                 if (dev_ingress_queue_create(dev))
1054                                         q = dev_ingress_queue(dev)->qdisc_sleeping;
1055                         }
1056                 } else {
1057                         q = dev->qdisc;
1058                 }
1059
1060                 /* It may be default qdisc, ignore it */
1061                 if (q && q->handle == 0)
1062                         q = NULL;
1063
1064                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1065                         if (tcm->tcm_handle) {
1066                                 if (q && !(n->nlmsg_flags&NLM_F_REPLACE))
1067                                         return -EEXIST;
1068                                 if (TC_H_MIN(tcm->tcm_handle))
1069                                         return -EINVAL;
1070                                 if ((q = qdisc_lookup(dev, tcm->tcm_handle)) == NULL)
1071                                         goto create_n_graft;
1072                                 if (n->nlmsg_flags&NLM_F_EXCL)
1073                                         return -EEXIST;
1074                                 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1075                                         return -EINVAL;
1076                                 if (q == p ||
1077                                     (p && check_loop(q, p, 0)))
1078                                         return -ELOOP;
1079                                 atomic_inc(&q->refcnt);
1080                                 goto graft;
1081                         } else {
1082                                 if (q == NULL)
1083                                         goto create_n_graft;
1084
1085                                 /* This magic test requires explanation.
1086                                  *
1087                                  *   We know, that some child q is already
1088                                  *   attached to this parent and have choice:
1089                                  *   either to change it or to create/graft new one.
1090                                  *
1091                                  *   1. We are allowed to create/graft only
1092                                  *   if CREATE and REPLACE flags are set.
1093                                  *
1094                                  *   2. If EXCL is set, requestor wanted to say,
1095                                  *   that qdisc tcm_handle is not expected
1096                                  *   to exist, so that we choose create/graft too.
1097                                  *
1098                                  *   3. The last case is when no flags are set.
1099                                  *   Alas, it is sort of hole in API, we
1100                                  *   cannot decide what to do unambiguously.
1101                                  *   For now we select create/graft, if
1102                                  *   user gave KIND, which does not match existing.
1103                                  */
1104                                 if ((n->nlmsg_flags&NLM_F_CREATE) &&
1105                                     (n->nlmsg_flags&NLM_F_REPLACE) &&
1106                                     ((n->nlmsg_flags&NLM_F_EXCL) ||
1107                                      (tca[TCA_KIND] &&
1108                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1109                                         goto create_n_graft;
1110                         }
1111                 }
1112         } else {
1113                 if (!tcm->tcm_handle)
1114                         return -EINVAL;
1115                 q = qdisc_lookup(dev, tcm->tcm_handle);
1116         }
1117
1118         /* Change qdisc parameters */
1119         if (q == NULL)
1120                 return -ENOENT;
1121         if (n->nlmsg_flags&NLM_F_EXCL)
1122                 return -EEXIST;
1123         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
1124                 return -EINVAL;
1125         err = qdisc_change(q, tca);
1126         if (err == 0)
1127                 qdisc_notify(net, skb, n, clid, NULL, q);
1128         return err;
1129
1130 create_n_graft:
1131         if (!(n->nlmsg_flags&NLM_F_CREATE))
1132                 return -ENOENT;
1133         if (clid == TC_H_INGRESS) {
1134                 if (dev_ingress_queue(dev))
1135                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1136                                          tcm->tcm_parent, tcm->tcm_parent,
1137                                          tca, &err);
1138                 else
1139                         err = -ENOENT;
1140         } else {
1141                 struct netdev_queue *dev_queue;
1142
1143                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1144                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1145                 else if (p)
1146                         dev_queue = p->dev_queue;
1147                 else
1148                         dev_queue = netdev_get_tx_queue(dev, 0);
1149
1150                 q = qdisc_create(dev, dev_queue, p,
1151                                  tcm->tcm_parent, tcm->tcm_handle,
1152                                  tca, &err);
1153         }
1154         if (q == NULL) {
1155                 if (err == -EAGAIN)
1156                         goto replay;
1157                 return err;
1158         }
1159
1160 graft:
1161         err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
1162         if (err) {
1163                 if (q)
1164                         qdisc_destroy(q);
1165                 return err;
1166         }
1167
1168         return 0;
1169 }
1170
1171 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
1172                          u32 pid, u32 seq, u16 flags, int event)
1173 {
1174         struct tcmsg *tcm;
1175         struct nlmsghdr  *nlh;
1176         unsigned char *b = skb_tail_pointer(skb);
1177         struct gnet_dump d;
1178
1179         nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1180         tcm = NLMSG_DATA(nlh);
1181         tcm->tcm_family = AF_UNSPEC;
1182         tcm->tcm__pad1 = 0;
1183         tcm->tcm__pad2 = 0;
1184         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1185         tcm->tcm_parent = clid;
1186         tcm->tcm_handle = q->handle;
1187         tcm->tcm_info = atomic_read(&q->refcnt);
1188         NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1189         if (q->ops->dump && q->ops->dump(q, skb) < 0)
1190                 goto nla_put_failure;
1191         q->qstats.qlen = q->q.qlen;
1192
1193         if (q->stab && qdisc_dump_stab(skb, q->stab) < 0)
1194                 goto nla_put_failure;
1195
1196         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1197                                          qdisc_root_sleeping_lock(q), &d) < 0)
1198                 goto nla_put_failure;
1199
1200         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
1201                 goto nla_put_failure;
1202
1203         if (gnet_stats_copy_basic(&d, &q->bstats) < 0 ||
1204             gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
1205             gnet_stats_copy_queue(&d, &q->qstats) < 0)
1206                 goto nla_put_failure;
1207
1208         if (gnet_stats_finish_copy(&d) < 0)
1209                 goto nla_put_failure;
1210
1211         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1212         return skb->len;
1213
1214 nlmsg_failure:
1215 nla_put_failure:
1216         nlmsg_trim(skb, b);
1217         return -1;
1218 }
1219
1220 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1221 {
1222         return (q->flags & TCQ_F_BUILTIN) ? true : false;
1223 }
1224
1225 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
1226                         struct nlmsghdr *n, u32 clid,
1227                         struct Qdisc *old, struct Qdisc *new)
1228 {
1229         struct sk_buff *skb;
1230         u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1231
1232         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1233         if (!skb)
1234                 return -ENOBUFS;
1235
1236         if (old && !tc_qdisc_dump_ignore(old)) {
1237                 if (tc_fill_qdisc(skb, old, clid, pid, n->nlmsg_seq, 0, RTM_DELQDISC) < 0)
1238                         goto err_out;
1239         }
1240         if (new && !tc_qdisc_dump_ignore(new)) {
1241                 if (tc_fill_qdisc(skb, new, clid, pid, n->nlmsg_seq, old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1242                         goto err_out;
1243         }
1244
1245         if (skb->len)
1246                 return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1247
1248 err_out:
1249         kfree_skb(skb);
1250         return -EINVAL;
1251 }
1252
1253 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1254                               struct netlink_callback *cb,
1255                               int *q_idx_p, int s_q_idx)
1256 {
1257         int ret = 0, q_idx = *q_idx_p;
1258         struct Qdisc *q;
1259
1260         if (!root)
1261                 return 0;
1262
1263         q = root;
1264         if (q_idx < s_q_idx) {
1265                 q_idx++;
1266         } else {
1267                 if (!tc_qdisc_dump_ignore(q) &&
1268                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1269                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1270                         goto done;
1271                 q_idx++;
1272         }
1273         list_for_each_entry(q, &root->list, list) {
1274                 if (q_idx < s_q_idx) {
1275                         q_idx++;
1276                         continue;
1277                 }
1278                 if (!tc_qdisc_dump_ignore(q) && 
1279                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid,
1280                                   cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0)
1281                         goto done;
1282                 q_idx++;
1283         }
1284
1285 out:
1286         *q_idx_p = q_idx;
1287         return ret;
1288 done:
1289         ret = -1;
1290         goto out;
1291 }
1292
1293 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1294 {
1295         struct net *net = sock_net(skb->sk);
1296         int idx, q_idx;
1297         int s_idx, s_q_idx;
1298         struct net_device *dev;
1299
1300         s_idx = cb->args[0];
1301         s_q_idx = q_idx = cb->args[1];
1302
1303         rcu_read_lock();
1304         idx = 0;
1305         for_each_netdev_rcu(net, dev) {
1306                 struct netdev_queue *dev_queue;
1307
1308                 if (idx < s_idx)
1309                         goto cont;
1310                 if (idx > s_idx)
1311                         s_q_idx = 0;
1312                 q_idx = 0;
1313
1314                 if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0)
1315                         goto done;
1316
1317                 dev_queue = dev_ingress_queue(dev);
1318                 if (dev_queue &&
1319                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1320                                        &q_idx, s_q_idx) < 0)
1321                         goto done;
1322
1323 cont:
1324                 idx++;
1325         }
1326
1327 done:
1328         rcu_read_unlock();
1329
1330         cb->args[0] = idx;
1331         cb->args[1] = q_idx;
1332
1333         return skb->len;
1334 }
1335
1336
1337
1338 /************************************************
1339  *      Traffic classes manipulation.           *
1340  ************************************************/
1341
1342
1343
1344 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n, void *arg)
1345 {
1346         struct net *net = sock_net(skb->sk);
1347         struct tcmsg *tcm = NLMSG_DATA(n);
1348         struct nlattr *tca[TCA_MAX + 1];
1349         struct net_device *dev;
1350         struct Qdisc *q = NULL;
1351         const struct Qdisc_class_ops *cops;
1352         unsigned long cl = 0;
1353         unsigned long new_cl;
1354         u32 pid = tcm->tcm_parent;
1355         u32 clid = tcm->tcm_handle;
1356         u32 qid = TC_H_MAJ(clid);
1357         int err;
1358
1359         if ((dev = __dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
1360                 return -ENODEV;
1361
1362         err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
1363         if (err < 0)
1364                 return err;
1365
1366         /*
1367            parent == TC_H_UNSPEC - unspecified parent.
1368            parent == TC_H_ROOT   - class is root, which has no parent.
1369            parent == X:0         - parent is root class.
1370            parent == X:Y         - parent is a node in hierarchy.
1371            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
1372
1373            handle == 0:0         - generate handle from kernel pool.
1374            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
1375            handle == X:Y         - clear.
1376            handle == X:0         - root class.
1377          */
1378
1379         /* Step 1. Determine qdisc handle X:0 */
1380
1381         if (pid != TC_H_ROOT) {
1382                 u32 qid1 = TC_H_MAJ(pid);
1383
1384                 if (qid && qid1) {
1385                         /* If both majors are known, they must be identical. */
1386                         if (qid != qid1)
1387                                 return -EINVAL;
1388                 } else if (qid1) {
1389                         qid = qid1;
1390                 } else if (qid == 0)
1391                         qid = dev->qdisc->handle;
1392
1393                 /* Now qid is genuine qdisc handle consistent
1394                    both with parent and child.
1395
1396                    TC_H_MAJ(pid) still may be unspecified, complete it now.
1397                  */
1398                 if (pid)
1399                         pid = TC_H_MAKE(qid, pid);
1400         } else {
1401                 if (qid == 0)
1402                         qid = dev->qdisc->handle;
1403         }
1404
1405         /* OK. Locate qdisc */
1406         if ((q = qdisc_lookup(dev, qid)) == NULL)
1407                 return -ENOENT;
1408
1409         /* An check that it supports classes */
1410         cops = q->ops->cl_ops;
1411         if (cops == NULL)
1412                 return -EINVAL;
1413
1414         /* Now try to get class */
1415         if (clid == 0) {
1416                 if (pid == TC_H_ROOT)
1417                         clid = qid;
1418         } else
1419                 clid = TC_H_MAKE(qid, clid);
1420
1421         if (clid)
1422                 cl = cops->get(q, clid);
1423
1424         if (cl == 0) {
1425                 err = -ENOENT;
1426                 if (n->nlmsg_type != RTM_NEWTCLASS || !(n->nlmsg_flags&NLM_F_CREATE))
1427                         goto out;
1428         } else {
1429                 switch (n->nlmsg_type) {
1430                 case RTM_NEWTCLASS:
1431                         err = -EEXIST;
1432                         if (n->nlmsg_flags&NLM_F_EXCL)
1433                                 goto out;
1434                         break;
1435                 case RTM_DELTCLASS:
1436                         err = -EOPNOTSUPP;
1437                         if (cops->delete)
1438                                 err = cops->delete(q, cl);
1439                         if (err == 0)
1440                                 tclass_notify(net, skb, n, q, cl, RTM_DELTCLASS);
1441                         goto out;
1442                 case RTM_GETTCLASS:
1443                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
1444                         goto out;
1445                 default:
1446                         err = -EINVAL;
1447                         goto out;
1448                 }
1449         }
1450
1451         new_cl = cl;
1452         err = -EOPNOTSUPP;
1453         if (cops->change)
1454                 err = cops->change(q, clid, pid, tca, &new_cl);
1455         if (err == 0)
1456                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
1457
1458 out:
1459         if (cl)
1460                 cops->put(q, cl);
1461
1462         return err;
1463 }
1464
1465
1466 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1467                           unsigned long cl,
1468                           u32 pid, u32 seq, u16 flags, int event)
1469 {
1470         struct tcmsg *tcm;
1471         struct nlmsghdr  *nlh;
1472         unsigned char *b = skb_tail_pointer(skb);
1473         struct gnet_dump d;
1474         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1475
1476         nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*tcm), flags);
1477         tcm = NLMSG_DATA(nlh);
1478         tcm->tcm_family = AF_UNSPEC;
1479         tcm->tcm__pad1 = 0;
1480         tcm->tcm__pad2 = 0;
1481         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1482         tcm->tcm_parent = q->handle;
1483         tcm->tcm_handle = q->handle;
1484         tcm->tcm_info = 0;
1485         NLA_PUT_STRING(skb, TCA_KIND, q->ops->id);
1486         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1487                 goto nla_put_failure;
1488
1489         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1490                                          qdisc_root_sleeping_lock(q), &d) < 0)
1491                 goto nla_put_failure;
1492
1493         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1494                 goto nla_put_failure;
1495
1496         if (gnet_stats_finish_copy(&d) < 0)
1497                 goto nla_put_failure;
1498
1499         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1500         return skb->len;
1501
1502 nlmsg_failure:
1503 nla_put_failure:
1504         nlmsg_trim(skb, b);
1505         return -1;
1506 }
1507
1508 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1509                          struct nlmsghdr *n, struct Qdisc *q,
1510                          unsigned long cl, int event)
1511 {
1512         struct sk_buff *skb;
1513         u32 pid = oskb ? NETLINK_CB(oskb).pid : 0;
1514
1515         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1516         if (!skb)
1517                 return -ENOBUFS;
1518
1519         if (tc_fill_tclass(skb, q, cl, pid, n->nlmsg_seq, 0, event) < 0) {
1520                 kfree_skb(skb);
1521                 return -EINVAL;
1522         }
1523
1524         return rtnetlink_send(skb, net, pid, RTNLGRP_TC, n->nlmsg_flags&NLM_F_ECHO);
1525 }
1526
1527 struct qdisc_dump_args
1528 {
1529         struct qdisc_walker w;
1530         struct sk_buff *skb;
1531         struct netlink_callback *cb;
1532 };
1533
1534 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl, struct qdisc_walker *arg)
1535 {
1536         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1537
1538         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).pid,
1539                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWTCLASS);
1540 }
1541
1542 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
1543                                 struct tcmsg *tcm, struct netlink_callback *cb,
1544                                 int *t_p, int s_t)
1545 {
1546         struct qdisc_dump_args arg;
1547
1548         if (tc_qdisc_dump_ignore(q) ||
1549             *t_p < s_t || !q->ops->cl_ops ||
1550             (tcm->tcm_parent &&
1551              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
1552                 (*t_p)++;
1553                 return 0;
1554         }
1555         if (*t_p > s_t)
1556                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
1557         arg.w.fn = qdisc_class_dump;
1558         arg.skb = skb;
1559         arg.cb = cb;
1560         arg.w.stop  = 0;
1561         arg.w.skip = cb->args[1];
1562         arg.w.count = 0;
1563         q->ops->cl_ops->walk(q, &arg.w);
1564         cb->args[1] = arg.w.count;
1565         if (arg.w.stop)
1566                 return -1;
1567         (*t_p)++;
1568         return 0;
1569 }
1570
1571 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1572                                struct tcmsg *tcm, struct netlink_callback *cb,
1573                                int *t_p, int s_t)
1574 {
1575         struct Qdisc *q;
1576
1577         if (!root)
1578                 return 0;
1579
1580         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1581                 return -1;
1582
1583         list_for_each_entry(q, &root->list, list) {
1584                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1585                         return -1;
1586         }
1587
1588         return 0;
1589 }
1590
1591 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
1592 {
1593         struct tcmsg *tcm = (struct tcmsg*)NLMSG_DATA(cb->nlh);
1594         struct net *net = sock_net(skb->sk);
1595         struct netdev_queue *dev_queue;
1596         struct net_device *dev;
1597         int t, s_t;
1598
1599         if (cb->nlh->nlmsg_len < NLMSG_LENGTH(sizeof(*tcm)))
1600                 return 0;
1601         if ((dev = dev_get_by_index(net, tcm->tcm_ifindex)) == NULL)
1602                 return 0;
1603
1604         s_t = cb->args[0];
1605         t = 0;
1606
1607         if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0)
1608                 goto done;
1609
1610         dev_queue = dev_ingress_queue(dev);
1611         if (dev_queue &&
1612             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
1613                                 &t, s_t) < 0)
1614                 goto done;
1615
1616 done:
1617         cb->args[0] = t;
1618
1619         dev_put(dev);
1620         return skb->len;
1621 }
1622
1623 /* Main classifier routine: scans classifier chain attached
1624    to this qdisc, (optionally) tests for protocol and asks
1625    specific classifiers.
1626  */
1627 int tc_classify_compat(struct sk_buff *skb, struct tcf_proto *tp,
1628                        struct tcf_result *res)
1629 {
1630         __be16 protocol = skb->protocol;
1631         int err = 0;
1632
1633         for (; tp; tp = tp->next) {
1634                 if ((tp->protocol == protocol ||
1635                      tp->protocol == htons(ETH_P_ALL)) &&
1636                     (err = tp->classify(skb, tp, res)) >= 0) {
1637 #ifdef CONFIG_NET_CLS_ACT
1638                         if (err != TC_ACT_RECLASSIFY && skb->tc_verd)
1639                                 skb->tc_verd = SET_TC_VERD(skb->tc_verd, 0);
1640 #endif
1641                         return err;
1642                 }
1643         }
1644         return -1;
1645 }
1646 EXPORT_SYMBOL(tc_classify_compat);
1647
/*
 * tc_classify - classify @skb against the chain @tp, honouring reclassify
 * requests.
 * @skb: packet to classify
 * @tp:  head of the classifier chain
 * @res: filled in by the matching classifier
 *
 * Without CONFIG_NET_CLS_ACT this is just tc_classify_compat().  With it, a
 * TC_ACT_RECLASSIFY result restarts classification from the head of the
 * chain; the number of restarts is tracked in skb->tc_verd and once it
 * reaches MAX_REC_LOOP the (rate-limited) loop is reported and TC_ACT_SHOT
 * is returned.
 *
 * Returns the classifier result, -1 if nothing matched, or TC_ACT_SHOT on a
 * reclassify loop.
 */
int tc_classify(struct sk_buff *skb, struct tcf_proto *tp,
                struct tcf_result *res)
{
        int err = 0;
#ifdef CONFIG_NET_CLS_ACT
        struct tcf_proto *otp = tp;
reclassify:
#endif

        err = tc_classify_compat(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
        if (err == TC_ACT_RECLASSIFY) {
                u32 verd = G_TC_VERD(skb->tc_verd);

                /* Every new pass starts from the head of the chain again. */
                tp = otp;

                if (verd++ >= MAX_REC_LOOP) {
                        if (net_ratelimit())
                                printk(KERN_NOTICE
                                       "%s: packet reclassify loop"
                                          " rule prio %u protocol %02x\n",
                                       tp->q->ops->id,
                                       tp->prio & 0xffff, ntohs(tp->protocol));
                        return TC_ACT_SHOT;
                }
                skb->tc_verd = SET_TC_VERD(skb->tc_verd, verd);
                goto reclassify;
        }
#endif
        return err;
}
EXPORT_SYMBOL(tc_classify);
1681
1682 void tcf_destroy(struct tcf_proto *tp)
1683 {
1684         tp->ops->destroy(tp);
1685         module_put(tp->ops->owner);
1686         kfree(tp);
1687 }
1688
1689 void tcf_destroy_chain(struct tcf_proto **fl)
1690 {
1691         struct tcf_proto *tp;
1692
1693         while ((tp = *fl) != NULL) {
1694                 *fl = tp->next;
1695                 tcf_destroy(tp);
1696         }
1697 }
1698 EXPORT_SYMBOL(tcf_destroy_chain);
1699
1700 #ifdef CONFIG_PROC_FS
1701 static int psched_show(struct seq_file *seq, void *v)
1702 {
1703         struct timespec ts;
1704
1705         hrtimer_get_res(CLOCK_MONOTONIC, &ts);
1706         seq_printf(seq, "%08x %08x %08x %08x\n",
1707                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
1708                    1000000,
1709                    (u32)NSEC_PER_SEC/(u32)ktime_to_ns(timespec_to_ktime(ts)));
1710
1711         return 0;
1712 }
1713
1714 static int psched_open(struct inode *inode, struct file *file)
1715 {
1716         return single_open(file, psched_show, NULL);
1717 }
1718
1719 static const struct file_operations psched_fops = {
1720         .owner = THIS_MODULE,
1721         .open = psched_open,
1722         .read  = seq_read,
1723         .llseek = seq_lseek,
1724         .release = single_release,
1725 };
1726
1727 static int __net_init psched_net_init(struct net *net)
1728 {
1729         struct proc_dir_entry *e;
1730
1731         e = proc_net_fops_create(net, "psched", 0, &psched_fops);
1732         if (e == NULL)
1733                 return -ENOMEM;
1734
1735         return 0;
1736 }
1737
1738 static void __net_exit psched_net_exit(struct net *net)
1739 {
1740         proc_net_remove(net, "psched");
1741 }
1742 #else
1743 static int __net_init psched_net_init(struct net *net)
1744 {
1745         return 0;
1746 }
1747
1748 static void __net_exit psched_net_exit(struct net *net)
1749 {
1750 }
1751 #endif
1752
1753 static struct pernet_operations psched_net_ops = {
1754         .init = psched_net_init,
1755         .exit = psched_net_exit,
1756 };
1757
1758 static int __init pktsched_init(void)
1759 {
1760         int err;
1761
1762         err = register_pernet_subsys(&psched_net_ops);
1763         if (err) {
1764                 printk(KERN_ERR "pktsched_init: "
1765                        "cannot initialize per netns operations\n");
1766                 return err;
1767         }
1768
1769         register_qdisc(&pfifo_qdisc_ops);
1770         register_qdisc(&bfifo_qdisc_ops);
1771         register_qdisc(&pfifo_head_drop_qdisc_ops);
1772         register_qdisc(&mq_qdisc_ops);
1773
1774         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
1775         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
1776         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
1777         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
1778         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
1779         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
1780
1781         return 0;
1782 }
1783
1784 subsys_initcall(pktsched_init);