]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/ipv4/fib_trie.c
net: suppress lockdep-RCU false positive in FIB trie.
[net-next-2.6.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
e905a9ed 10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
19baf839 11 * Agricultural Sciences.
e905a9ed 12 *
19baf839
RO
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
e905a9ed 16 *
19baf839
RO
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
19baf839
RO
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
32 *
33 * IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
fd966255
RO
42 *
43 * Substantial contributions to this work comes from:
44 *
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
19baf839
RO
49 */
50
80b71b80 51#define VERSION "0.409"
19baf839 52
19baf839
RO
53#include <asm/uaccess.h>
54#include <asm/system.h>
1977f032 55#include <linux/bitops.h>
19baf839
RO
56#include <linux/types.h>
57#include <linux/kernel.h>
19baf839
RO
58#include <linux/mm.h>
59#include <linux/string.h>
60#include <linux/socket.h>
61#include <linux/sockios.h>
62#include <linux/errno.h>
63#include <linux/in.h>
64#include <linux/inet.h>
cd8787ab 65#include <linux/inetdevice.h>
19baf839
RO
66#include <linux/netdevice.h>
67#include <linux/if_arp.h>
68#include <linux/proc_fs.h>
2373ce1c 69#include <linux/rcupdate.h>
19baf839
RO
70#include <linux/skbuff.h>
71#include <linux/netlink.h>
72#include <linux/init.h>
73#include <linux/list.h>
457c4cbc 74#include <net/net_namespace.h>
19baf839
RO
75#include <net/ip.h>
76#include <net/protocol.h>
77#include <net/route.h>
78#include <net/tcp.h>
79#include <net/sock.h>
80#include <net/ip_fib.h>
81#include "fib_lookup.h"
82
06ef921d 83#define MAX_STAT_DEPTH 32
19baf839 84
19baf839 85#define KEYLENGTH (8*sizeof(t_key))
19baf839 86
19baf839
RO
87typedef unsigned int t_key;
88
89#define T_TNODE 0
90#define T_LEAF 1
91#define NODE_TYPE_MASK 0x1UL
2373ce1c
RO
92#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
93
91b9a277
OJ
94#define IS_TNODE(n) (!(n->parent & T_LEAF))
95#define IS_LEAF(n) (n->parent & T_LEAF)
19baf839
RO
96
97struct node {
91b9a277 98 unsigned long parent;
8d965444 99 t_key key;
19baf839
RO
100};
101
102struct leaf {
91b9a277 103 unsigned long parent;
8d965444 104 t_key key;
19baf839 105 struct hlist_head list;
2373ce1c 106 struct rcu_head rcu;
19baf839
RO
107};
108
109struct leaf_info {
110 struct hlist_node hlist;
2373ce1c 111 struct rcu_head rcu;
19baf839
RO
112 int plen;
113 struct list_head falh;
114};
115
116struct tnode {
91b9a277 117 unsigned long parent;
8d965444 118 t_key key;
112d8cfc
ED
119 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
120 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
8d965444
ED
121 unsigned int full_children; /* KEYLENGTH bits needed */
122 unsigned int empty_children; /* KEYLENGTH bits needed */
15be75cd
SH
123 union {
124 struct rcu_head rcu;
125 struct work_struct work;
e0f7cb8c 126 struct tnode *tnode_free;
15be75cd 127 };
91b9a277 128 struct node *child[0];
19baf839
RO
129};
130
131#ifdef CONFIG_IP_FIB_TRIE_STATS
132struct trie_use_stats {
133 unsigned int gets;
134 unsigned int backtrack;
135 unsigned int semantic_match_passed;
136 unsigned int semantic_match_miss;
137 unsigned int null_node_hit;
2f36895a 138 unsigned int resize_node_skipped;
19baf839
RO
139};
140#endif
141
142struct trie_stat {
143 unsigned int totdepth;
144 unsigned int maxdepth;
145 unsigned int tnodes;
146 unsigned int leaves;
147 unsigned int nullpointers;
93672292 148 unsigned int prefixes;
06ef921d 149 unsigned int nodesizes[MAX_STAT_DEPTH];
c877efb2 150};
19baf839
RO
151
152struct trie {
91b9a277 153 struct node *trie;
19baf839
RO
154#ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats;
156#endif
19baf839
RO
157};
158
19baf839 159static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
a07f5f50
SH
160static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
161 int wasfull);
19baf839 162static struct node *resize(struct trie *t, struct tnode *tn);
2f80b3c8
RO
163static struct tnode *inflate(struct trie *t, struct tnode *tn);
164static struct tnode *halve(struct trie *t, struct tnode *tn);
e0f7cb8c
JP
165/* tnodes to free after resize(); protected by RTNL */
166static struct tnode *tnode_free_head;
c3059477
JP
167static size_t tnode_free_size;
168
169/*
170 * synchronize_rcu after call_rcu for that many pages; it should be especially
171 * useful before resizing the root node with PREEMPT_NONE configs; the value was
172 * obtained experimentally, aiming to avoid visible slowdown.
173 */
174static const int sync_pages = 128;
19baf839 175
e18b890b 176static struct kmem_cache *fn_alias_kmem __read_mostly;
bc3c8c1e 177static struct kmem_cache *trie_leaf_kmem __read_mostly;
19baf839 178
06801916
SH
179static inline struct tnode *node_parent(struct node *node)
180{
b59cfbf7
ED
181 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
182}
183
184static inline struct tnode *node_parent_rcu(struct node *node)
185{
186 struct tnode *ret = node_parent(node);
06801916 187
06801916
SH
188 return rcu_dereference(ret);
189}
190
6440cc9e
SH
191/* Same as rcu_assign_pointer
192 * but that macro() assumes that value is a pointer.
193 */
06801916
SH
194static inline void node_set_parent(struct node *node, struct tnode *ptr)
195{
6440cc9e
SH
196 smp_wmb();
197 node->parent = (unsigned long)ptr | NODE_TYPE(node);
06801916 198}
2373ce1c 199
b59cfbf7
ED
200static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
201{
202 BUG_ON(i >= 1U << tn->bits);
2373ce1c 203
b59cfbf7
ED
204 return tn->child[i];
205}
206
207static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
19baf839 208{
b59cfbf7 209 struct node *ret = tnode_get_child(tn, i);
19baf839 210
b59cfbf7 211 return rcu_dereference(ret);
19baf839
RO
212}
213
bb435b8d 214static inline int tnode_child_length(const struct tnode *tn)
19baf839 215{
91b9a277 216 return 1 << tn->bits;
19baf839
RO
217}
218
ab66b4a7
SH
219static inline t_key mask_pfx(t_key k, unsigned short l)
220{
221 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
222}
223
19baf839
RO
224static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
225{
91b9a277 226 if (offset < KEYLENGTH)
19baf839 227 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
91b9a277 228 else
19baf839
RO
229 return 0;
230}
231
232static inline int tkey_equals(t_key a, t_key b)
233{
c877efb2 234 return a == b;
19baf839
RO
235}
236
237static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
238{
c877efb2
SH
239 if (bits == 0 || offset >= KEYLENGTH)
240 return 1;
91b9a277
OJ
241 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
242 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
c877efb2 243}
19baf839
RO
244
245static inline int tkey_mismatch(t_key a, int offset, t_key b)
246{
247 t_key diff = a ^ b;
248 int i = offset;
249
c877efb2
SH
250 if (!diff)
251 return 0;
252 while ((diff << i) >> (KEYLENGTH-1) == 0)
19baf839
RO
253 i++;
254 return i;
255}
256
19baf839 257/*
e905a9ed
YH
258 To understand this stuff, an understanding of keys and all their bits is
259 necessary. Every node in the trie has a key associated with it, but not
19baf839
RO
260 all of the bits in that key are significant.
261
262 Consider a node 'n' and its parent 'tp'.
263
e905a9ed
YH
264 If n is a leaf, every bit in its key is significant. Its presence is
265 necessitated by path compression, since during a tree traversal (when
266 searching for a leaf - unless we are doing an insertion) we will completely
267 ignore all skipped bits we encounter. Thus we need to verify, at the end of
268 a potentially successful search, that we have indeed been walking the
19baf839
RO
269 correct key path.
270
e905a9ed
YH
271 Note that we can never "miss" the correct key in the tree if present by
272 following the wrong path. Path compression ensures that segments of the key
273 that are the same for all keys with a given prefix are skipped, but the
274 skipped part *is* identical for each node in the subtrie below the skipped
275 bit! trie_insert() in this implementation takes care of that - note the
19baf839
RO
276 call to tkey_sub_equals() in trie_insert().
277
e905a9ed 278 if n is an internal node - a 'tnode' here, the various parts of its key
19baf839
RO
279 have many different meanings.
280
e905a9ed 281 Example:
19baf839
RO
282 _________________________________________________________________
283 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
284 -----------------------------------------------------------------
e905a9ed 285 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
19baf839
RO
286
287 _________________________________________________________________
288 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
289 -----------------------------------------------------------------
290 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
291
292 tp->pos = 7
293 tp->bits = 3
294 n->pos = 15
91b9a277 295 n->bits = 4
19baf839 296
e905a9ed
YH
297 First, let's just ignore the bits that come before the parent tp, that is
298 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
19baf839
RO
299 not use them for anything.
300
301 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
e905a9ed 302 index into the parent's child array. That is, they will be used to find
19baf839
RO
303 'n' among tp's children.
304
305 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
306 for the node n.
307
e905a9ed 308 All the bits we have seen so far are significant to the node n. The rest
19baf839
RO
309 of the bits are really not needed or indeed known in n->key.
310
e905a9ed 311 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
19baf839 312 n's child array, and will of course be different for each child.
e905a9ed 313
c877efb2 314
19baf839
RO
315 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
316 at this point.
317
318*/
319
0c7770c7 320static inline void check_tnode(const struct tnode *tn)
19baf839 321{
0c7770c7 322 WARN_ON(tn && tn->pos+tn->bits > 32);
19baf839
RO
323}
324
f5026fab
DL
325static const int halve_threshold = 25;
326static const int inflate_threshold = 50;
345aa031 327static const int halve_threshold_root = 15;
80b71b80 328static const int inflate_threshold_root = 30;
2373ce1c
RO
329
330static void __alias_free_mem(struct rcu_head *head)
19baf839 331{
2373ce1c
RO
332 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
333 kmem_cache_free(fn_alias_kmem, fa);
19baf839
RO
334}
335
2373ce1c 336static inline void alias_free_mem_rcu(struct fib_alias *fa)
19baf839 337{
2373ce1c
RO
338 call_rcu(&fa->rcu, __alias_free_mem);
339}
91b9a277 340
2373ce1c
RO
341static void __leaf_free_rcu(struct rcu_head *head)
342{
bc3c8c1e
SH
343 struct leaf *l = container_of(head, struct leaf, rcu);
344 kmem_cache_free(trie_leaf_kmem, l);
2373ce1c 345}
91b9a277 346
387a5487
SH
347static inline void free_leaf(struct leaf *l)
348{
349 call_rcu_bh(&l->rcu, __leaf_free_rcu);
350}
351
2373ce1c 352static void __leaf_info_free_rcu(struct rcu_head *head)
19baf839 353{
2373ce1c 354 kfree(container_of(head, struct leaf_info, rcu));
19baf839
RO
355}
356
2373ce1c 357static inline void free_leaf_info(struct leaf_info *leaf)
19baf839 358{
2373ce1c 359 call_rcu(&leaf->rcu, __leaf_info_free_rcu);
19baf839
RO
360}
361
8d965444 362static struct tnode *tnode_alloc(size_t size)
f0e36f8c 363{
2373ce1c 364 if (size <= PAGE_SIZE)
8d965444 365 return kzalloc(size, GFP_KERNEL);
15be75cd
SH
366 else
367 return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
368}
2373ce1c 369
15be75cd
SH
370static void __tnode_vfree(struct work_struct *arg)
371{
372 struct tnode *tn = container_of(arg, struct tnode, work);
373 vfree(tn);
f0e36f8c
PM
374}
375
2373ce1c 376static void __tnode_free_rcu(struct rcu_head *head)
f0e36f8c 377{
2373ce1c 378 struct tnode *tn = container_of(head, struct tnode, rcu);
8d965444
ED
379 size_t size = sizeof(struct tnode) +
380 (sizeof(struct node *) << tn->bits);
f0e36f8c
PM
381
382 if (size <= PAGE_SIZE)
383 kfree(tn);
15be75cd
SH
384 else {
385 INIT_WORK(&tn->work, __tnode_vfree);
386 schedule_work(&tn->work);
387 }
f0e36f8c
PM
388}
389
2373ce1c
RO
390static inline void tnode_free(struct tnode *tn)
391{
387a5487
SH
392 if (IS_LEAF(tn))
393 free_leaf((struct leaf *) tn);
394 else
550e29bc 395 call_rcu(&tn->rcu, __tnode_free_rcu);
2373ce1c
RO
396}
397
e0f7cb8c
JP
398static void tnode_free_safe(struct tnode *tn)
399{
400 BUG_ON(IS_LEAF(tn));
7b85576d
JP
401 tn->tnode_free = tnode_free_head;
402 tnode_free_head = tn;
c3059477
JP
403 tnode_free_size += sizeof(struct tnode) +
404 (sizeof(struct node *) << tn->bits);
e0f7cb8c
JP
405}
406
407static void tnode_free_flush(void)
408{
409 struct tnode *tn;
410
411 while ((tn = tnode_free_head)) {
412 tnode_free_head = tn->tnode_free;
413 tn->tnode_free = NULL;
414 tnode_free(tn);
415 }
c3059477
JP
416
417 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
418 tnode_free_size = 0;
419 synchronize_rcu();
420 }
e0f7cb8c
JP
421}
422
2373ce1c
RO
423static struct leaf *leaf_new(void)
424{
bc3c8c1e 425 struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
2373ce1c
RO
426 if (l) {
427 l->parent = T_LEAF;
428 INIT_HLIST_HEAD(&l->list);
429 }
430 return l;
431}
432
433static struct leaf_info *leaf_info_new(int plen)
434{
435 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
436 if (li) {
437 li->plen = plen;
438 INIT_LIST_HEAD(&li->falh);
439 }
440 return li;
441}
442
a07f5f50 443static struct tnode *tnode_new(t_key key, int pos, int bits)
19baf839 444{
8d965444 445 size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
f0e36f8c 446 struct tnode *tn = tnode_alloc(sz);
19baf839 447
91b9a277 448 if (tn) {
2373ce1c 449 tn->parent = T_TNODE;
19baf839
RO
450 tn->pos = pos;
451 tn->bits = bits;
452 tn->key = key;
453 tn->full_children = 0;
454 tn->empty_children = 1<<bits;
455 }
c877efb2 456
8d965444
ED
457 pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
458 (unsigned long) (sizeof(struct node) << bits));
19baf839
RO
459 return tn;
460}
461
19baf839
RO
462/*
463 * Check whether a tnode 'n' is "full", i.e. it is an internal node
464 * and no bits are skipped. See discussion in dyntree paper p. 6
465 */
466
bb435b8d 467static inline int tnode_full(const struct tnode *tn, const struct node *n)
19baf839 468{
c877efb2 469 if (n == NULL || IS_LEAF(n))
19baf839
RO
470 return 0;
471
472 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
473}
474
a07f5f50
SH
475static inline void put_child(struct trie *t, struct tnode *tn, int i,
476 struct node *n)
19baf839
RO
477{
478 tnode_put_child_reorg(tn, i, n, -1);
479}
480
c877efb2 481 /*
19baf839
RO
482 * Add a child at position i overwriting the old value.
483 * Update the value of full_children and empty_children.
484 */
485
a07f5f50
SH
486static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
487 int wasfull)
19baf839 488{
2373ce1c 489 struct node *chi = tn->child[i];
19baf839
RO
490 int isfull;
491
0c7770c7
SH
492 BUG_ON(i >= 1<<tn->bits);
493
19baf839
RO
494 /* update emptyChildren */
495 if (n == NULL && chi != NULL)
496 tn->empty_children++;
497 else if (n != NULL && chi == NULL)
498 tn->empty_children--;
c877efb2 499
19baf839 500 /* update fullChildren */
91b9a277 501 if (wasfull == -1)
19baf839
RO
502 wasfull = tnode_full(tn, chi);
503
504 isfull = tnode_full(tn, n);
c877efb2 505 if (wasfull && !isfull)
19baf839 506 tn->full_children--;
c877efb2 507 else if (!wasfull && isfull)
19baf839 508 tn->full_children++;
91b9a277 509
c877efb2 510 if (n)
06801916 511 node_set_parent(n, tn);
19baf839 512
2373ce1c 513 rcu_assign_pointer(tn->child[i], n);
19baf839
RO
514}
515
80b71b80 516#define MAX_WORK 10
c877efb2 517static struct node *resize(struct trie *t, struct tnode *tn)
19baf839
RO
518{
519 int i;
2f80b3c8 520 struct tnode *old_tn;
e6308be8
RO
521 int inflate_threshold_use;
522 int halve_threshold_use;
80b71b80 523 int max_work;
19baf839 524
e905a9ed 525 if (!tn)
19baf839
RO
526 return NULL;
527
0c7770c7
SH
528 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
529 tn, inflate_threshold, halve_threshold);
19baf839
RO
530
531 /* No children */
532 if (tn->empty_children == tnode_child_length(tn)) {
e0f7cb8c 533 tnode_free_safe(tn);
19baf839
RO
534 return NULL;
535 }
536 /* One child */
537 if (tn->empty_children == tnode_child_length(tn) - 1)
80b71b80 538 goto one_child;
c877efb2 539 /*
19baf839
RO
540 * Double as long as the resulting node has a number of
541 * nonempty nodes that are above the threshold.
542 */
543
544 /*
c877efb2
SH
545 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
546 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 547 * Telecommunications, page 6:
c877efb2 548 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
549 * children in the *doubled* node is at least 'high'."
550 *
c877efb2
SH
551 * 'high' in this instance is the variable 'inflate_threshold'. It
552 * is expressed as a percentage, so we multiply it with
553 * tnode_child_length() and instead of multiplying by 2 (since the
554 * child array will be doubled by inflate()) and multiplying
555 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 556 * multiply the left-hand side by 50.
c877efb2
SH
557 *
558 * The left-hand side may look a bit weird: tnode_child_length(tn)
559 * - tn->empty_children is of course the number of non-null children
560 * in the current node. tn->full_children is the number of "full"
19baf839 561 * children, that is non-null tnodes with a skip value of 0.
c877efb2 562 * All of those will be doubled in the resulting inflated tnode, so
19baf839 563 * we just count them one extra time here.
c877efb2 564 *
19baf839 565 * A clearer way to write this would be:
c877efb2 566 *
19baf839 567 * to_be_doubled = tn->full_children;
c877efb2 568 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
569 * tn->full_children;
570 *
571 * new_child_length = tnode_child_length(tn) * 2;
572 *
c877efb2 573 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
574 * new_child_length;
575 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
576 *
577 * ...and so on, tho it would mess up the while () loop.
578 *
19baf839
RO
579 * anyway,
580 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
581 * inflate_threshold
c877efb2 582 *
19baf839
RO
583 * avoid a division:
584 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
585 * inflate_threshold * new_child_length
c877efb2 586 *
19baf839 587 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 588 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 589 * tn->full_children) >= inflate_threshold * new_child_length
c877efb2 590 *
19baf839 591 * expand new_child_length:
c877efb2 592 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 593 * tn->full_children) >=
19baf839 594 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 595 *
19baf839 596 * shorten again:
c877efb2 597 * 50 * (tn->full_children + tnode_child_length(tn) -
91b9a277 598 * tn->empty_children) >= inflate_threshold *
19baf839 599 * tnode_child_length(tn)
c877efb2 600 *
19baf839
RO
601 */
602
603 check_tnode(tn);
c877efb2 604
e6308be8
RO
605 /* Keep root node larger */
606
80b71b80
JL
607 if (!node_parent((struct node*) tn)) {
608 inflate_threshold_use = inflate_threshold_root;
609 halve_threshold_use = halve_threshold_root;
610 }
611 else {
e6308be8 612 inflate_threshold_use = inflate_threshold;
80b71b80
JL
613 halve_threshold_use = halve_threshold;
614 }
e6308be8 615
80b71b80
JL
616 max_work = MAX_WORK;
617 while ((tn->full_children > 0 && max_work-- &&
a07f5f50
SH
618 50 * (tn->full_children + tnode_child_length(tn)
619 - tn->empty_children)
620 >= inflate_threshold_use * tnode_child_length(tn))) {
19baf839 621
2f80b3c8
RO
622 old_tn = tn;
623 tn = inflate(t, tn);
a07f5f50 624
2f80b3c8
RO
625 if (IS_ERR(tn)) {
626 tn = old_tn;
2f36895a
RO
627#ifdef CONFIG_IP_FIB_TRIE_STATS
628 t->stats.resize_node_skipped++;
629#endif
630 break;
631 }
19baf839
RO
632 }
633
634 check_tnode(tn);
635
80b71b80
JL
636 /* Return if at least one inflate is run */
637 if( max_work != MAX_WORK)
638 return (struct node *) tn;
639
19baf839
RO
640 /*
641 * Halve as long as the number of empty children in this
642 * node is above threshold.
643 */
2f36895a 644
80b71b80
JL
645 max_work = MAX_WORK;
646 while (tn->bits > 1 && max_work-- &&
19baf839 647 100 * (tnode_child_length(tn) - tn->empty_children) <
e6308be8 648 halve_threshold_use * tnode_child_length(tn)) {
2f36895a 649
2f80b3c8
RO
650 old_tn = tn;
651 tn = halve(t, tn);
652 if (IS_ERR(tn)) {
653 tn = old_tn;
2f36895a
RO
654#ifdef CONFIG_IP_FIB_TRIE_STATS
655 t->stats.resize_node_skipped++;
656#endif
657 break;
658 }
659 }
19baf839 660
c877efb2 661
19baf839 662 /* Only one child remains */
80b71b80
JL
663 if (tn->empty_children == tnode_child_length(tn) - 1) {
664one_child:
19baf839 665 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 666 struct node *n;
19baf839 667
91b9a277 668 n = tn->child[i];
2373ce1c 669 if (!n)
91b9a277 670 continue;
91b9a277
OJ
671
672 /* compress one level */
673
06801916 674 node_set_parent(n, NULL);
e0f7cb8c 675 tnode_free_safe(tn);
91b9a277 676 return n;
19baf839 677 }
80b71b80 678 }
19baf839
RO
679 return (struct node *) tn;
680}
681
2f80b3c8 682static struct tnode *inflate(struct trie *t, struct tnode *tn)
19baf839 683{
19baf839
RO
684 struct tnode *oldtnode = tn;
685 int olen = tnode_child_length(tn);
686 int i;
687
0c7770c7 688 pr_debug("In inflate\n");
19baf839
RO
689
690 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
691
0c7770c7 692 if (!tn)
2f80b3c8 693 return ERR_PTR(-ENOMEM);
2f36895a
RO
694
695 /*
c877efb2
SH
696 * Preallocate and store tnodes before the actual work so we
697 * don't get into an inconsistent state if memory allocation
698 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
699 * of tnode is ignored.
700 */
91b9a277
OJ
701
702 for (i = 0; i < olen; i++) {
a07f5f50 703 struct tnode *inode;
2f36895a 704
a07f5f50 705 inode = (struct tnode *) tnode_get_child(oldtnode, i);
2f36895a
RO
706 if (inode &&
707 IS_TNODE(inode) &&
708 inode->pos == oldtnode->pos + oldtnode->bits &&
709 inode->bits > 1) {
710 struct tnode *left, *right;
ab66b4a7 711 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
c877efb2 712
2f36895a
RO
713 left = tnode_new(inode->key&(~m), inode->pos + 1,
714 inode->bits - 1);
2f80b3c8
RO
715 if (!left)
716 goto nomem;
91b9a277 717
2f36895a
RO
718 right = tnode_new(inode->key|m, inode->pos + 1,
719 inode->bits - 1);
720
e905a9ed 721 if (!right) {
2f80b3c8
RO
722 tnode_free(left);
723 goto nomem;
e905a9ed 724 }
2f36895a
RO
725
726 put_child(t, tn, 2*i, (struct node *) left);
727 put_child(t, tn, 2*i+1, (struct node *) right);
728 }
729 }
730
91b9a277 731 for (i = 0; i < olen; i++) {
c95aaf9a 732 struct tnode *inode;
19baf839 733 struct node *node = tnode_get_child(oldtnode, i);
91b9a277
OJ
734 struct tnode *left, *right;
735 int size, j;
c877efb2 736
19baf839
RO
737 /* An empty child */
738 if (node == NULL)
739 continue;
740
741 /* A leaf or an internal node with skipped bits */
742
c877efb2 743 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
19baf839 744 tn->pos + tn->bits - 1) {
a07f5f50
SH
745 if (tkey_extract_bits(node->key,
746 oldtnode->pos + oldtnode->bits,
747 1) == 0)
19baf839
RO
748 put_child(t, tn, 2*i, node);
749 else
750 put_child(t, tn, 2*i+1, node);
751 continue;
752 }
753
754 /* An internal node with two children */
755 inode = (struct tnode *) node;
756
757 if (inode->bits == 1) {
758 put_child(t, tn, 2*i, inode->child[0]);
759 put_child(t, tn, 2*i+1, inode->child[1]);
760
e0f7cb8c 761 tnode_free_safe(inode);
91b9a277 762 continue;
19baf839
RO
763 }
764
91b9a277
OJ
765 /* An internal node with more than two children */
766
767 /* We will replace this node 'inode' with two new
768 * ones, 'left' and 'right', each with half of the
769 * original children. The two new nodes will have
770 * a position one bit further down the key and this
771 * means that the "significant" part of their keys
772 * (see the discussion near the top of this file)
773 * will differ by one bit, which will be "0" in
774 * left's key and "1" in right's key. Since we are
775 * moving the key position by one step, the bit that
776 * we are moving away from - the bit at position
777 * (inode->pos) - is the one that will differ between
778 * left and right. So... we synthesize that bit in the
779 * two new keys.
780 * The mask 'm' below will be a single "one" bit at
781 * the position (inode->pos)
782 */
19baf839 783
91b9a277
OJ
784 /* Use the old key, but set the new significant
785 * bit to zero.
786 */
2f36895a 787
91b9a277
OJ
788 left = (struct tnode *) tnode_get_child(tn, 2*i);
789 put_child(t, tn, 2*i, NULL);
2f36895a 790
91b9a277 791 BUG_ON(!left);
2f36895a 792
91b9a277
OJ
793 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
794 put_child(t, tn, 2*i+1, NULL);
19baf839 795
91b9a277 796 BUG_ON(!right);
19baf839 797
91b9a277
OJ
798 size = tnode_child_length(left);
799 for (j = 0; j < size; j++) {
800 put_child(t, left, j, inode->child[j]);
801 put_child(t, right, j, inode->child[j + size]);
19baf839 802 }
91b9a277
OJ
803 put_child(t, tn, 2*i, resize(t, left));
804 put_child(t, tn, 2*i+1, resize(t, right));
805
e0f7cb8c 806 tnode_free_safe(inode);
19baf839 807 }
e0f7cb8c 808 tnode_free_safe(oldtnode);
19baf839 809 return tn;
2f80b3c8
RO
810nomem:
811 {
812 int size = tnode_child_length(tn);
813 int j;
814
0c7770c7 815 for (j = 0; j < size; j++)
2f80b3c8
RO
816 if (tn->child[j])
817 tnode_free((struct tnode *)tn->child[j]);
818
819 tnode_free(tn);
0c7770c7 820
2f80b3c8
RO
821 return ERR_PTR(-ENOMEM);
822 }
19baf839
RO
823}
824
2f80b3c8 825static struct tnode *halve(struct trie *t, struct tnode *tn)
19baf839
RO
826{
827 struct tnode *oldtnode = tn;
828 struct node *left, *right;
829 int i;
830 int olen = tnode_child_length(tn);
831
0c7770c7 832 pr_debug("In halve\n");
c877efb2
SH
833
834 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 835
2f80b3c8
RO
836 if (!tn)
837 return ERR_PTR(-ENOMEM);
2f36895a
RO
838
839 /*
c877efb2
SH
840 * Preallocate and store tnodes before the actual work so we
841 * don't get into an inconsistent state if memory allocation
842 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
843 * of tnode is ignored.
844 */
845
91b9a277 846 for (i = 0; i < olen; i += 2) {
2f36895a
RO
847 left = tnode_get_child(oldtnode, i);
848 right = tnode_get_child(oldtnode, i+1);
c877efb2 849
2f36895a 850 /* Two nonempty children */
0c7770c7 851 if (left && right) {
2f80b3c8 852 struct tnode *newn;
0c7770c7 853
2f80b3c8 854 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
0c7770c7
SH
855
856 if (!newn)
2f80b3c8 857 goto nomem;
0c7770c7 858
2f80b3c8 859 put_child(t, tn, i/2, (struct node *)newn);
2f36895a 860 }
2f36895a 861
2f36895a 862 }
19baf839 863
91b9a277
OJ
864 for (i = 0; i < olen; i += 2) {
865 struct tnode *newBinNode;
866
19baf839
RO
867 left = tnode_get_child(oldtnode, i);
868 right = tnode_get_child(oldtnode, i+1);
c877efb2 869
19baf839
RO
870 /* At least one of the children is empty */
871 if (left == NULL) {
872 if (right == NULL) /* Both are empty */
873 continue;
874 put_child(t, tn, i/2, right);
91b9a277 875 continue;
0c7770c7 876 }
91b9a277
OJ
877
878 if (right == NULL) {
19baf839 879 put_child(t, tn, i/2, left);
91b9a277
OJ
880 continue;
881 }
c877efb2 882
19baf839 883 /* Two nonempty children */
91b9a277
OJ
884 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
885 put_child(t, tn, i/2, NULL);
91b9a277
OJ
886 put_child(t, newBinNode, 0, left);
887 put_child(t, newBinNode, 1, right);
888 put_child(t, tn, i/2, resize(t, newBinNode));
19baf839 889 }
e0f7cb8c 890 tnode_free_safe(oldtnode);
19baf839 891 return tn;
2f80b3c8
RO
892nomem:
893 {
894 int size = tnode_child_length(tn);
895 int j;
896
0c7770c7 897 for (j = 0; j < size; j++)
2f80b3c8
RO
898 if (tn->child[j])
899 tnode_free((struct tnode *)tn->child[j]);
900
901 tnode_free(tn);
0c7770c7 902
2f80b3c8
RO
903 return ERR_PTR(-ENOMEM);
904 }
19baf839
RO
905}
906
772cb712 907/* readside must use rcu_read_lock currently dump routines
2373ce1c
RO
908 via get_fa_head and dump */
909
772cb712 910static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
19baf839 911{
772cb712 912 struct hlist_head *head = &l->list;
19baf839
RO
913 struct hlist_node *node;
914 struct leaf_info *li;
915
2373ce1c 916 hlist_for_each_entry_rcu(li, node, head, hlist)
c877efb2 917 if (li->plen == plen)
19baf839 918 return li;
91b9a277 919
19baf839
RO
920 return NULL;
921}
922
a07f5f50 923static inline struct list_head *get_fa_head(struct leaf *l, int plen)
19baf839 924{
772cb712 925 struct leaf_info *li = find_leaf_info(l, plen);
c877efb2 926
91b9a277
OJ
927 if (!li)
928 return NULL;
c877efb2 929
91b9a277 930 return &li->falh;
19baf839
RO
931}
932
933static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
934{
e905a9ed
YH
935 struct leaf_info *li = NULL, *last = NULL;
936 struct hlist_node *node;
937
938 if (hlist_empty(head)) {
939 hlist_add_head_rcu(&new->hlist, head);
940 } else {
941 hlist_for_each_entry(li, node, head, hlist) {
942 if (new->plen > li->plen)
943 break;
944
945 last = li;
946 }
947 if (last)
948 hlist_add_after_rcu(&last->hlist, &new->hlist);
949 else
950 hlist_add_before_rcu(&new->hlist, &li->hlist);
951 }
19baf839
RO
952}
953
2373ce1c
RO
954/* rcu_read_lock needs to be hold by caller from readside */
955
19baf839
RO
956static struct leaf *
957fib_find_node(struct trie *t, u32 key)
958{
959 int pos;
960 struct tnode *tn;
961 struct node *n;
962
963 pos = 0;
634a4b20
PM
964 n = rcu_dereference_check(t->trie,
965 rcu_read_lock_held() ||
966 lockdep_rtnl_is_held());
19baf839
RO
967
968 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
969 tn = (struct tnode *) n;
91b9a277 970
19baf839 971 check_tnode(tn);
91b9a277 972
c877efb2 973 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
91b9a277 974 pos = tn->pos + tn->bits;
a07f5f50
SH
975 n = tnode_get_child_rcu(tn,
976 tkey_extract_bits(key,
977 tn->pos,
978 tn->bits));
91b9a277 979 } else
19baf839
RO
980 break;
981 }
982 /* Case we have found a leaf. Compare prefixes */
983
91b9a277
OJ
984 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
985 return (struct leaf *)n;
986
19baf839
RO
987 return NULL;
988}
989
7b85576d 990static void trie_rebalance(struct trie *t, struct tnode *tn)
19baf839 991{
19baf839 992 int wasfull;
3ed18d76 993 t_key cindex, key;
06801916 994 struct tnode *tp;
19baf839 995
3ed18d76
RO
996 key = tn->key;
997
06801916 998 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
19baf839
RO
999 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1000 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
a07f5f50
SH
1001 tn = (struct tnode *) resize(t, (struct tnode *)tn);
1002
1003 tnode_put_child_reorg((struct tnode *)tp, cindex,
1004 (struct node *)tn, wasfull);
91b9a277 1005
06801916 1006 tp = node_parent((struct node *) tn);
008440e3
JP
1007 if (!tp)
1008 rcu_assign_pointer(t->trie, (struct node *)tn);
1009
e0f7cb8c 1010 tnode_free_flush();
06801916 1011 if (!tp)
19baf839 1012 break;
06801916 1013 tn = tp;
19baf839 1014 }
06801916 1015
19baf839 1016 /* Handle last (top) tnode */
7b85576d 1017 if (IS_TNODE(tn))
a07f5f50 1018 tn = (struct tnode *)resize(t, (struct tnode *)tn);
19baf839 1019
7b85576d
JP
1020 rcu_assign_pointer(t->trie, (struct node *)tn);
1021 tnode_free_flush();
1022
1023 return;
19baf839
RO
1024}
1025
2373ce1c
RO
1026/* only used from updater-side */
1027
fea86ad8 1028static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
19baf839
RO
1029{
1030 int pos, newpos;
1031 struct tnode *tp = NULL, *tn = NULL;
1032 struct node *n;
1033 struct leaf *l;
1034 int missbit;
c877efb2 1035 struct list_head *fa_head = NULL;
19baf839
RO
1036 struct leaf_info *li;
1037 t_key cindex;
1038
1039 pos = 0;
c877efb2 1040 n = t->trie;
19baf839 1041
c877efb2
SH
1042 /* If we point to NULL, stop. Either the tree is empty and we should
1043 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 1044 * and we should just put our new leaf in that.
c877efb2
SH
1045 * If we point to a T_TNODE, check if it matches our key. Note that
1046 * a T_TNODE might be skipping any number of bits - its 'pos' need
19baf839
RO
1047 * not be the parent's 'pos'+'bits'!
1048 *
c877efb2 1049 * If it does match the current key, get pos/bits from it, extract
19baf839
RO
1050 * the index from our key, push the T_TNODE and walk the tree.
1051 *
1052 * If it doesn't, we have to replace it with a new T_TNODE.
1053 *
c877efb2
SH
1054 * If we point to a T_LEAF, it might or might not have the same key
1055 * as we do. If it does, just change the value, update the T_LEAF's
1056 * value, and return it.
19baf839
RO
1057 * If it doesn't, we need to replace it with a T_TNODE.
1058 */
1059
1060 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1061 tn = (struct tnode *) n;
91b9a277 1062
c877efb2 1063 check_tnode(tn);
91b9a277 1064
c877efb2 1065 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
19baf839 1066 tp = tn;
91b9a277 1067 pos = tn->pos + tn->bits;
a07f5f50
SH
1068 n = tnode_get_child(tn,
1069 tkey_extract_bits(key,
1070 tn->pos,
1071 tn->bits));
19baf839 1072
06801916 1073 BUG_ON(n && node_parent(n) != tn);
91b9a277 1074 } else
19baf839
RO
1075 break;
1076 }
1077
1078 /*
1079 * n ----> NULL, LEAF or TNODE
1080 *
c877efb2 1081 * tp is n's (parent) ----> NULL or TNODE
19baf839
RO
1082 */
1083
91b9a277 1084 BUG_ON(tp && IS_LEAF(tp));
19baf839
RO
1085
1086 /* Case 1: n is a leaf. Compare prefixes */
1087
c877efb2 1088 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
c95aaf9a 1089 l = (struct leaf *) n;
19baf839 1090 li = leaf_info_new(plen);
91b9a277 1091
fea86ad8
SH
1092 if (!li)
1093 return NULL;
19baf839
RO
1094
1095 fa_head = &li->falh;
1096 insert_leaf_info(&l->list, li);
1097 goto done;
1098 }
19baf839
RO
1099 l = leaf_new();
1100
fea86ad8
SH
1101 if (!l)
1102 return NULL;
19baf839
RO
1103
1104 l->key = key;
1105 li = leaf_info_new(plen);
1106
c877efb2 1107 if (!li) {
387a5487 1108 free_leaf(l);
fea86ad8 1109 return NULL;
f835e471 1110 }
19baf839
RO
1111
1112 fa_head = &li->falh;
1113 insert_leaf_info(&l->list, li);
1114
19baf839 1115 if (t->trie && n == NULL) {
91b9a277 1116 /* Case 2: n is NULL, and will just insert a new leaf */
19baf839 1117
06801916 1118 node_set_parent((struct node *)l, tp);
19baf839 1119
91b9a277
OJ
1120 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1121 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1122 } else {
1123 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
c877efb2
SH
1124 /*
1125 * Add a new tnode here
19baf839
RO
1126 * first tnode need some special handling
1127 */
1128
1129 if (tp)
91b9a277 1130 pos = tp->pos+tp->bits;
19baf839 1131 else
91b9a277
OJ
1132 pos = 0;
1133
c877efb2 1134 if (n) {
19baf839
RO
1135 newpos = tkey_mismatch(key, pos, n->key);
1136 tn = tnode_new(n->key, newpos, 1);
91b9a277 1137 } else {
19baf839 1138 newpos = 0;
c877efb2 1139 tn = tnode_new(key, newpos, 1); /* First tnode */
19baf839 1140 }
19baf839 1141
c877efb2 1142 if (!tn) {
f835e471 1143 free_leaf_info(li);
387a5487 1144 free_leaf(l);
fea86ad8 1145 return NULL;
91b9a277
OJ
1146 }
1147
06801916 1148 node_set_parent((struct node *)tn, tp);
19baf839 1149
91b9a277 1150 missbit = tkey_extract_bits(key, newpos, 1);
19baf839
RO
1151 put_child(t, tn, missbit, (struct node *)l);
1152 put_child(t, tn, 1-missbit, n);
1153
c877efb2 1154 if (tp) {
19baf839 1155 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
a07f5f50
SH
1156 put_child(t, (struct tnode *)tp, cindex,
1157 (struct node *)tn);
91b9a277 1158 } else {
a07f5f50 1159 rcu_assign_pointer(t->trie, (struct node *)tn);
19baf839
RO
1160 tp = tn;
1161 }
1162 }
91b9a277
OJ
1163
1164 if (tp && tp->pos + tp->bits > 32)
a07f5f50
SH
1165 pr_warning("fib_trie"
1166 " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1167 tp, tp->pos, tp->bits, key, plen);
91b9a277 1168
19baf839 1169 /* Rebalance the trie */
2373ce1c 1170
7b85576d 1171 trie_rebalance(t, tp);
f835e471 1172done:
19baf839
RO
1173 return fa_head;
1174}
1175
d562f1f8
RO
1176/*
1177 * Caller must hold RTNL.
1178 */
16c6cf8b 1179int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1180{
1181 struct trie *t = (struct trie *) tb->tb_data;
1182 struct fib_alias *fa, *new_fa;
c877efb2 1183 struct list_head *fa_head = NULL;
19baf839 1184 struct fib_info *fi;
4e902c57
TG
1185 int plen = cfg->fc_dst_len;
1186 u8 tos = cfg->fc_tos;
19baf839
RO
1187 u32 key, mask;
1188 int err;
1189 struct leaf *l;
1190
1191 if (plen > 32)
1192 return -EINVAL;
1193
4e902c57 1194 key = ntohl(cfg->fc_dst);
19baf839 1195
2dfe55b4 1196 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
19baf839 1197
91b9a277 1198 mask = ntohl(inet_make_mask(plen));
19baf839 1199
c877efb2 1200 if (key & ~mask)
19baf839
RO
1201 return -EINVAL;
1202
1203 key = key & mask;
1204
4e902c57
TG
1205 fi = fib_create_info(cfg);
1206 if (IS_ERR(fi)) {
1207 err = PTR_ERR(fi);
19baf839 1208 goto err;
4e902c57 1209 }
19baf839
RO
1210
1211 l = fib_find_node(t, key);
c877efb2 1212 fa = NULL;
19baf839 1213
c877efb2 1214 if (l) {
19baf839
RO
1215 fa_head = get_fa_head(l, plen);
1216 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1217 }
1218
1219 /* Now fa, if non-NULL, points to the first fib alias
1220 * with the same keys [prefix,tos,priority], if such key already
1221 * exists or to the node before which we will insert new one.
1222 *
1223 * If fa is NULL, we will need to allocate a new one and
1224 * insert to the head of f.
1225 *
1226 * If f is NULL, no fib node matched the destination key
1227 * and we need to allocate a new one of those as well.
1228 */
1229
936f6f8e
JA
1230 if (fa && fa->fa_tos == tos &&
1231 fa->fa_info->fib_priority == fi->fib_priority) {
1232 struct fib_alias *fa_first, *fa_match;
19baf839
RO
1233
1234 err = -EEXIST;
4e902c57 1235 if (cfg->fc_nlflags & NLM_F_EXCL)
19baf839
RO
1236 goto out;
1237
936f6f8e
JA
1238 /* We have 2 goals:
1239 * 1. Find exact match for type, scope, fib_info to avoid
1240 * duplicate routes
1241 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1242 */
1243 fa_match = NULL;
1244 fa_first = fa;
1245 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1246 list_for_each_entry_continue(fa, fa_head, fa_list) {
1247 if (fa->fa_tos != tos)
1248 break;
1249 if (fa->fa_info->fib_priority != fi->fib_priority)
1250 break;
1251 if (fa->fa_type == cfg->fc_type &&
1252 fa->fa_scope == cfg->fc_scope &&
1253 fa->fa_info == fi) {
1254 fa_match = fa;
1255 break;
1256 }
1257 }
1258
4e902c57 1259 if (cfg->fc_nlflags & NLM_F_REPLACE) {
19baf839
RO
1260 struct fib_info *fi_drop;
1261 u8 state;
1262
936f6f8e
JA
1263 fa = fa_first;
1264 if (fa_match) {
1265 if (fa == fa_match)
1266 err = 0;
6725033f 1267 goto out;
936f6f8e 1268 }
2373ce1c 1269 err = -ENOBUFS;
e94b1766 1270 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
2373ce1c
RO
1271 if (new_fa == NULL)
1272 goto out;
19baf839
RO
1273
1274 fi_drop = fa->fa_info;
2373ce1c
RO
1275 new_fa->fa_tos = fa->fa_tos;
1276 new_fa->fa_info = fi;
4e902c57
TG
1277 new_fa->fa_type = cfg->fc_type;
1278 new_fa->fa_scope = cfg->fc_scope;
19baf839 1279 state = fa->fa_state;
936f6f8e 1280 new_fa->fa_state = state & ~FA_S_ACCESSED;
19baf839 1281
2373ce1c
RO
1282 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1283 alias_free_mem_rcu(fa);
19baf839
RO
1284
1285 fib_release_info(fi_drop);
1286 if (state & FA_S_ACCESSED)
76e6ebfb 1287 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
b8f55831
MK
1288 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1289 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
19baf839 1290
91b9a277 1291 goto succeeded;
19baf839
RO
1292 }
1293 /* Error if we find a perfect match which
1294 * uses the same scope, type, and nexthop
1295 * information.
1296 */
936f6f8e
JA
1297 if (fa_match)
1298 goto out;
a07f5f50 1299
4e902c57 1300 if (!(cfg->fc_nlflags & NLM_F_APPEND))
936f6f8e 1301 fa = fa_first;
19baf839
RO
1302 }
1303 err = -ENOENT;
4e902c57 1304 if (!(cfg->fc_nlflags & NLM_F_CREATE))
19baf839
RO
1305 goto out;
1306
1307 err = -ENOBUFS;
e94b1766 1308 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
19baf839
RO
1309 if (new_fa == NULL)
1310 goto out;
1311
1312 new_fa->fa_info = fi;
1313 new_fa->fa_tos = tos;
4e902c57
TG
1314 new_fa->fa_type = cfg->fc_type;
1315 new_fa->fa_scope = cfg->fc_scope;
19baf839 1316 new_fa->fa_state = 0;
19baf839
RO
1317 /*
1318 * Insert new entry to the list.
1319 */
1320
c877efb2 1321 if (!fa_head) {
fea86ad8
SH
1322 fa_head = fib_insert_node(t, key, plen);
1323 if (unlikely(!fa_head)) {
1324 err = -ENOMEM;
f835e471 1325 goto out_free_new_fa;
fea86ad8 1326 }
f835e471 1327 }
19baf839 1328
2373ce1c
RO
1329 list_add_tail_rcu(&new_fa->fa_list,
1330 (fa ? &fa->fa_list : fa_head));
19baf839 1331
76e6ebfb 1332 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
4e902c57 1333 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
b8f55831 1334 &cfg->fc_nlinfo, 0);
19baf839
RO
1335succeeded:
1336 return 0;
f835e471
RO
1337
1338out_free_new_fa:
1339 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1340out:
1341 fib_release_info(fi);
91b9a277 1342err:
19baf839
RO
1343 return err;
1344}
1345
772cb712 1346/* should be called with rcu_read_lock */
a07f5f50
SH
1347static int check_leaf(struct trie *t, struct leaf *l,
1348 t_key key, const struct flowi *flp,
1349 struct fib_result *res)
19baf839 1350{
19baf839
RO
1351 struct leaf_info *li;
1352 struct hlist_head *hhead = &l->list;
1353 struct hlist_node *node;
c877efb2 1354
2373ce1c 1355 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
a07f5f50
SH
1356 int err;
1357 int plen = li->plen;
1358 __be32 mask = inet_make_mask(plen);
1359
888454c5 1360 if (l->key != (key & ntohl(mask)))
19baf839
RO
1361 continue;
1362
e204a345 1363 err = fib_semantic_match(&li->falh, flp, res, plen);
a07f5f50 1364
19baf839 1365#ifdef CONFIG_IP_FIB_TRIE_STATS
a07f5f50 1366 if (err <= 0)
19baf839 1367 t->stats.semantic_match_passed++;
a07f5f50
SH
1368 else
1369 t->stats.semantic_match_miss++;
19baf839 1370#endif
a07f5f50 1371 if (err <= 0)
2e655571 1372 return err;
19baf839 1373 }
a07f5f50 1374
2e655571 1375 return 1;
19baf839
RO
1376}
1377
16c6cf8b
SH
1378int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1379 struct fib_result *res)
19baf839
RO
1380{
1381 struct trie *t = (struct trie *) tb->tb_data;
2e655571 1382 int ret;
19baf839
RO
1383 struct node *n;
1384 struct tnode *pn;
1385 int pos, bits;
91b9a277 1386 t_key key = ntohl(flp->fl4_dst);
19baf839
RO
1387 int chopped_off;
1388 t_key cindex = 0;
1389 int current_prefix_length = KEYLENGTH;
91b9a277
OJ
1390 struct tnode *cn;
1391 t_key node_prefix, key_prefix, pref_mismatch;
1392 int mp;
1393
2373ce1c 1394 rcu_read_lock();
91b9a277 1395
2373ce1c 1396 n = rcu_dereference(t->trie);
c877efb2 1397 if (!n)
19baf839
RO
1398 goto failed;
1399
1400#ifdef CONFIG_IP_FIB_TRIE_STATS
1401 t->stats.gets++;
1402#endif
1403
1404 /* Just a leaf? */
1405 if (IS_LEAF(n)) {
2e655571 1406 ret = check_leaf(t, (struct leaf *)n, key, flp, res);
a07f5f50 1407 goto found;
19baf839 1408 }
a07f5f50 1409
19baf839
RO
1410 pn = (struct tnode *) n;
1411 chopped_off = 0;
c877efb2 1412
91b9a277 1413 while (pn) {
19baf839
RO
1414 pos = pn->pos;
1415 bits = pn->bits;
1416
c877efb2 1417 if (!chopped_off)
ab66b4a7
SH
1418 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1419 pos, bits);
19baf839 1420
b902e573 1421 n = tnode_get_child_rcu(pn, cindex);
19baf839
RO
1422
1423 if (n == NULL) {
1424#ifdef CONFIG_IP_FIB_TRIE_STATS
1425 t->stats.null_node_hit++;
1426#endif
1427 goto backtrace;
1428 }
1429
91b9a277 1430 if (IS_LEAF(n)) {
2e655571
BH
1431 ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1432 if (ret > 0)
91b9a277 1433 goto backtrace;
a07f5f50 1434 goto found;
91b9a277
OJ
1435 }
1436
91b9a277 1437 cn = (struct tnode *)n;
19baf839 1438
91b9a277
OJ
1439 /*
1440 * It's a tnode, and we can do some extra checks here if we
1441 * like, to avoid descending into a dead-end branch.
1442 * This tnode is in the parent's child array at index
1443 * key[p_pos..p_pos+p_bits] but potentially with some bits
1444 * chopped off, so in reality the index may be just a
1445 * subprefix, padded with zero at the end.
1446 * We can also take a look at any skipped bits in this
1447 * tnode - everything up to p_pos is supposed to be ok,
1448 * and the non-chopped bits of the index (se previous
1449 * paragraph) are also guaranteed ok, but the rest is
1450 * considered unknown.
1451 *
1452 * The skipped bits are key[pos+bits..cn->pos].
1453 */
19baf839 1454
91b9a277
OJ
1455 /* If current_prefix_length < pos+bits, we are already doing
1456 * actual prefix matching, which means everything from
1457 * pos+(bits-chopped_off) onward must be zero along some
1458 * branch of this subtree - otherwise there is *no* valid
1459 * prefix present. Here we can only check the skipped
1460 * bits. Remember, since we have already indexed into the
1461 * parent's child array, we know that the bits we chopped of
1462 * *are* zero.
1463 */
19baf839 1464
a07f5f50
SH
1465 /* NOTA BENE: Checking only skipped bits
1466 for the new node here */
19baf839 1467
91b9a277
OJ
1468 if (current_prefix_length < pos+bits) {
1469 if (tkey_extract_bits(cn->key, current_prefix_length,
a07f5f50
SH
1470 cn->pos - current_prefix_length)
1471 || !(cn->child[0]))
91b9a277
OJ
1472 goto backtrace;
1473 }
19baf839 1474
91b9a277
OJ
1475 /*
1476 * If chopped_off=0, the index is fully validated and we
1477 * only need to look at the skipped bits for this, the new,
1478 * tnode. What we actually want to do is to find out if
1479 * these skipped bits match our key perfectly, or if we will
1480 * have to count on finding a matching prefix further down,
1481 * because if we do, we would like to have some way of
1482 * verifying the existence of such a prefix at this point.
1483 */
19baf839 1484
91b9a277
OJ
1485 /* The only thing we can do at this point is to verify that
1486 * any such matching prefix can indeed be a prefix to our
1487 * key, and if the bits in the node we are inspecting that
1488 * do not match our key are not ZERO, this cannot be true.
1489 * Thus, find out where there is a mismatch (before cn->pos)
1490 * and verify that all the mismatching bits are zero in the
1491 * new tnode's key.
1492 */
19baf839 1493
a07f5f50
SH
1494 /*
1495 * Note: We aren't very concerned about the piece of
1496 * the key that precede pn->pos+pn->bits, since these
1497 * have already been checked. The bits after cn->pos
1498 * aren't checked since these are by definition
1499 * "unknown" at this point. Thus, what we want to see
1500 * is if we are about to enter the "prefix matching"
1501 * state, and in that case verify that the skipped
1502 * bits that will prevail throughout this subtree are
1503 * zero, as they have to be if we are to find a
1504 * matching prefix.
91b9a277
OJ
1505 */
1506
ab66b4a7
SH
1507 node_prefix = mask_pfx(cn->key, cn->pos);
1508 key_prefix = mask_pfx(key, cn->pos);
91b9a277
OJ
1509 pref_mismatch = key_prefix^node_prefix;
1510 mp = 0;
1511
a07f5f50
SH
1512 /*
1513 * In short: If skipped bits in this node do not match
1514 * the search key, enter the "prefix matching"
1515 * state.directly.
91b9a277
OJ
1516 */
1517 if (pref_mismatch) {
1518 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1519 mp++;
a07f5f50 1520 pref_mismatch = pref_mismatch << 1;
91b9a277
OJ
1521 }
1522 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1523
1524 if (key_prefix != 0)
1525 goto backtrace;
1526
1527 if (current_prefix_length >= cn->pos)
1528 current_prefix_length = mp;
c877efb2 1529 }
a07f5f50 1530
91b9a277
OJ
1531 pn = (struct tnode *)n; /* Descend */
1532 chopped_off = 0;
1533 continue;
1534
19baf839
RO
1535backtrace:
1536 chopped_off++;
1537
1538 /* As zero don't change the child key (cindex) */
a07f5f50
SH
1539 while ((chopped_off <= pn->bits)
1540 && !(cindex & (1<<(chopped_off-1))))
19baf839 1541 chopped_off++;
19baf839
RO
1542
1543 /* Decrease current_... with bits chopped off */
1544 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
a07f5f50
SH
1545 current_prefix_length = pn->pos + pn->bits
1546 - chopped_off;
91b9a277 1547
19baf839 1548 /*
c877efb2 1549 * Either we do the actual chop off according or if we have
19baf839
RO
1550 * chopped off all bits in this tnode walk up to our parent.
1551 */
1552
91b9a277 1553 if (chopped_off <= pn->bits) {
19baf839 1554 cindex &= ~(1 << (chopped_off-1));
91b9a277 1555 } else {
b902e573 1556 struct tnode *parent = node_parent_rcu((struct node *) pn);
06801916 1557 if (!parent)
19baf839 1558 goto failed;
91b9a277 1559
19baf839 1560 /* Get Child's index */
06801916
SH
1561 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1562 pn = parent;
19baf839
RO
1563 chopped_off = 0;
1564
1565#ifdef CONFIG_IP_FIB_TRIE_STATS
1566 t->stats.backtrack++;
1567#endif
1568 goto backtrace;
c877efb2 1569 }
19baf839
RO
1570 }
1571failed:
c877efb2 1572 ret = 1;
19baf839 1573found:
2373ce1c 1574 rcu_read_unlock();
19baf839
RO
1575 return ret;
1576}
1577
9195bef7
SH
1578/*
1579 * Remove the leaf and return parent.
1580 */
1581static void trie_leaf_remove(struct trie *t, struct leaf *l)
19baf839 1582{
9195bef7 1583 struct tnode *tp = node_parent((struct node *) l);
c877efb2 1584
9195bef7 1585 pr_debug("entering trie_leaf_remove(%p)\n", l);
19baf839 1586
c877efb2 1587 if (tp) {
9195bef7 1588 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
19baf839 1589 put_child(t, (struct tnode *)tp, cindex, NULL);
7b85576d 1590 trie_rebalance(t, tp);
91b9a277 1591 } else
2373ce1c 1592 rcu_assign_pointer(t->trie, NULL);
19baf839 1593
387a5487 1594 free_leaf(l);
19baf839
RO
1595}
1596
d562f1f8
RO
1597/*
1598 * Caller must hold RTNL.
1599 */
16c6cf8b 1600int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1601{
1602 struct trie *t = (struct trie *) tb->tb_data;
1603 u32 key, mask;
4e902c57
TG
1604 int plen = cfg->fc_dst_len;
1605 u8 tos = cfg->fc_tos;
19baf839
RO
1606 struct fib_alias *fa, *fa_to_delete;
1607 struct list_head *fa_head;
1608 struct leaf *l;
91b9a277
OJ
1609 struct leaf_info *li;
1610
c877efb2 1611 if (plen > 32)
19baf839
RO
1612 return -EINVAL;
1613
4e902c57 1614 key = ntohl(cfg->fc_dst);
91b9a277 1615 mask = ntohl(inet_make_mask(plen));
19baf839 1616
c877efb2 1617 if (key & ~mask)
19baf839
RO
1618 return -EINVAL;
1619
1620 key = key & mask;
1621 l = fib_find_node(t, key);
1622
c877efb2 1623 if (!l)
19baf839
RO
1624 return -ESRCH;
1625
1626 fa_head = get_fa_head(l, plen);
1627 fa = fib_find_alias(fa_head, tos, 0);
1628
1629 if (!fa)
1630 return -ESRCH;
1631
0c7770c7 1632 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
19baf839
RO
1633
1634 fa_to_delete = NULL;
936f6f8e
JA
1635 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1636 list_for_each_entry_continue(fa, fa_head, fa_list) {
19baf839
RO
1637 struct fib_info *fi = fa->fa_info;
1638
1639 if (fa->fa_tos != tos)
1640 break;
1641
4e902c57
TG
1642 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1643 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1644 fa->fa_scope == cfg->fc_scope) &&
1645 (!cfg->fc_protocol ||
1646 fi->fib_protocol == cfg->fc_protocol) &&
1647 fib_nh_match(cfg, fi) == 0) {
19baf839
RO
1648 fa_to_delete = fa;
1649 break;
1650 }
1651 }
1652
91b9a277
OJ
1653 if (!fa_to_delete)
1654 return -ESRCH;
19baf839 1655
91b9a277 1656 fa = fa_to_delete;
4e902c57 1657 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
b8f55831 1658 &cfg->fc_nlinfo, 0);
91b9a277
OJ
1659
1660 l = fib_find_node(t, key);
772cb712 1661 li = find_leaf_info(l, plen);
19baf839 1662
2373ce1c 1663 list_del_rcu(&fa->fa_list);
19baf839 1664
91b9a277 1665 if (list_empty(fa_head)) {
2373ce1c 1666 hlist_del_rcu(&li->hlist);
91b9a277 1667 free_leaf_info(li);
2373ce1c 1668 }
19baf839 1669
91b9a277 1670 if (hlist_empty(&l->list))
9195bef7 1671 trie_leaf_remove(t, l);
19baf839 1672
91b9a277 1673 if (fa->fa_state & FA_S_ACCESSED)
76e6ebfb 1674 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
19baf839 1675
2373ce1c
RO
1676 fib_release_info(fa->fa_info);
1677 alias_free_mem_rcu(fa);
91b9a277 1678 return 0;
19baf839
RO
1679}
1680
ef3660ce 1681static int trie_flush_list(struct list_head *head)
19baf839
RO
1682{
1683 struct fib_alias *fa, *fa_node;
1684 int found = 0;
1685
1686 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1687 struct fib_info *fi = fa->fa_info;
19baf839 1688
2373ce1c
RO
1689 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1690 list_del_rcu(&fa->fa_list);
1691 fib_release_info(fa->fa_info);
1692 alias_free_mem_rcu(fa);
19baf839
RO
1693 found++;
1694 }
1695 }
1696 return found;
1697}
1698
ef3660ce 1699static int trie_flush_leaf(struct leaf *l)
19baf839
RO
1700{
1701 int found = 0;
1702 struct hlist_head *lih = &l->list;
1703 struct hlist_node *node, *tmp;
1704 struct leaf_info *li = NULL;
1705
1706 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
ef3660ce 1707 found += trie_flush_list(&li->falh);
19baf839
RO
1708
1709 if (list_empty(&li->falh)) {
2373ce1c 1710 hlist_del_rcu(&li->hlist);
19baf839
RO
1711 free_leaf_info(li);
1712 }
1713 }
1714 return found;
1715}
1716
82cfbb00
SH
1717/*
1718 * Scan for the next right leaf starting at node p->child[idx]
1719 * Since we have back pointer, no recursion necessary.
1720 */
1721static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
19baf839 1722{
82cfbb00
SH
1723 do {
1724 t_key idx;
c877efb2 1725
c877efb2 1726 if (c)
82cfbb00 1727 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
c877efb2 1728 else
82cfbb00 1729 idx = 0;
2373ce1c 1730
82cfbb00
SH
1731 while (idx < 1u << p->bits) {
1732 c = tnode_get_child_rcu(p, idx++);
2373ce1c 1733 if (!c)
91b9a277
OJ
1734 continue;
1735
82cfbb00
SH
1736 if (IS_LEAF(c)) {
1737 prefetch(p->child[idx]);
1738 return (struct leaf *) c;
19baf839 1739 }
82cfbb00
SH
1740
1741 /* Rescan start scanning in new node */
1742 p = (struct tnode *) c;
1743 idx = 0;
19baf839 1744 }
82cfbb00
SH
1745
1746 /* Node empty, walk back up to parent */
91b9a277 1747 c = (struct node *) p;
82cfbb00
SH
1748 } while ( (p = node_parent_rcu(c)) != NULL);
1749
1750 return NULL; /* Root of trie */
1751}
1752
82cfbb00
SH
1753static struct leaf *trie_firstleaf(struct trie *t)
1754{
1755 struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1756
1757 if (!n)
1758 return NULL;
1759
1760 if (IS_LEAF(n)) /* trie is just a leaf */
1761 return (struct leaf *) n;
1762
1763 return leaf_walk_rcu(n, NULL);
1764}
1765
1766static struct leaf *trie_nextleaf(struct leaf *l)
1767{
1768 struct node *c = (struct node *) l;
b902e573 1769 struct tnode *p = node_parent_rcu(c);
82cfbb00
SH
1770
1771 if (!p)
1772 return NULL; /* trie with just one leaf */
1773
1774 return leaf_walk_rcu(p, c);
19baf839
RO
1775}
1776
71d67e66
SH
1777static struct leaf *trie_leafindex(struct trie *t, int index)
1778{
1779 struct leaf *l = trie_firstleaf(t);
1780
ec28cf73 1781 while (l && index-- > 0)
71d67e66 1782 l = trie_nextleaf(l);
ec28cf73 1783
71d67e66
SH
1784 return l;
1785}
1786
1787
d562f1f8
RO
1788/*
1789 * Caller must hold RTNL.
1790 */
16c6cf8b 1791int fib_table_flush(struct fib_table *tb)
19baf839
RO
1792{
1793 struct trie *t = (struct trie *) tb->tb_data;
9195bef7 1794 struct leaf *l, *ll = NULL;
82cfbb00 1795 int found = 0;
19baf839 1796
82cfbb00 1797 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
ef3660ce 1798 found += trie_flush_leaf(l);
19baf839
RO
1799
1800 if (ll && hlist_empty(&ll->list))
9195bef7 1801 trie_leaf_remove(t, ll);
19baf839
RO
1802 ll = l;
1803 }
1804
1805 if (ll && hlist_empty(&ll->list))
9195bef7 1806 trie_leaf_remove(t, ll);
19baf839 1807
0c7770c7 1808 pr_debug("trie_flush found=%d\n", found);
19baf839
RO
1809 return found;
1810}
1811
16c6cf8b
SH
1812void fib_table_select_default(struct fib_table *tb,
1813 const struct flowi *flp,
1814 struct fib_result *res)
19baf839
RO
1815{
1816 struct trie *t = (struct trie *) tb->tb_data;
1817 int order, last_idx;
1818 struct fib_info *fi = NULL;
1819 struct fib_info *last_resort;
1820 struct fib_alias *fa = NULL;
1821 struct list_head *fa_head;
1822 struct leaf *l;
1823
1824 last_idx = -1;
1825 last_resort = NULL;
1826 order = -1;
1827
2373ce1c 1828 rcu_read_lock();
c877efb2 1829
19baf839 1830 l = fib_find_node(t, 0);
c877efb2 1831 if (!l)
19baf839
RO
1832 goto out;
1833
1834 fa_head = get_fa_head(l, 0);
c877efb2 1835 if (!fa_head)
19baf839
RO
1836 goto out;
1837
c877efb2 1838 if (list_empty(fa_head))
19baf839
RO
1839 goto out;
1840
2373ce1c 1841 list_for_each_entry_rcu(fa, fa_head, fa_list) {
19baf839 1842 struct fib_info *next_fi = fa->fa_info;
91b9a277 1843
19baf839
RO
1844 if (fa->fa_scope != res->scope ||
1845 fa->fa_type != RTN_UNICAST)
1846 continue;
91b9a277 1847
19baf839
RO
1848 if (next_fi->fib_priority > res->fi->fib_priority)
1849 break;
1850 if (!next_fi->fib_nh[0].nh_gw ||
1851 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1852 continue;
1853 fa->fa_state |= FA_S_ACCESSED;
91b9a277 1854
19baf839
RO
1855 if (fi == NULL) {
1856 if (next_fi != res->fi)
1857 break;
1858 } else if (!fib_detect_death(fi, order, &last_resort,
971b893e 1859 &last_idx, tb->tb_default)) {
a2bbe682 1860 fib_result_assign(res, fi);
971b893e 1861 tb->tb_default = order;
19baf839
RO
1862 goto out;
1863 }
1864 fi = next_fi;
1865 order++;
1866 }
1867 if (order <= 0 || fi == NULL) {
971b893e 1868 tb->tb_default = -1;
19baf839
RO
1869 goto out;
1870 }
1871
971b893e
DL
1872 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1873 tb->tb_default)) {
a2bbe682 1874 fib_result_assign(res, fi);
971b893e 1875 tb->tb_default = order;
19baf839
RO
1876 goto out;
1877 }
a2bbe682
DL
1878 if (last_idx >= 0)
1879 fib_result_assign(res, last_resort);
971b893e
DL
1880 tb->tb_default = last_idx;
1881out:
2373ce1c 1882 rcu_read_unlock();
19baf839
RO
1883}
1884
a07f5f50
SH
1885static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1886 struct fib_table *tb,
19baf839
RO
1887 struct sk_buff *skb, struct netlink_callback *cb)
1888{
1889 int i, s_i;
1890 struct fib_alias *fa;
32ab5f80 1891 __be32 xkey = htonl(key);
19baf839 1892
71d67e66 1893 s_i = cb->args[5];
19baf839
RO
1894 i = 0;
1895
2373ce1c
RO
1896 /* rcu_read_lock is hold by caller */
1897
1898 list_for_each_entry_rcu(fa, fah, fa_list) {
19baf839
RO
1899 if (i < s_i) {
1900 i++;
1901 continue;
1902 }
19baf839
RO
1903
1904 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1905 cb->nlh->nlmsg_seq,
1906 RTM_NEWROUTE,
1907 tb->tb_id,
1908 fa->fa_type,
1909 fa->fa_scope,
be403ea1 1910 xkey,
19baf839
RO
1911 plen,
1912 fa->fa_tos,
64347f78 1913 fa->fa_info, NLM_F_MULTI) < 0) {
71d67e66 1914 cb->args[5] = i;
19baf839 1915 return -1;
91b9a277 1916 }
19baf839
RO
1917 i++;
1918 }
71d67e66 1919 cb->args[5] = i;
19baf839
RO
1920 return skb->len;
1921}
1922
a88ee229
SH
1923static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1924 struct sk_buff *skb, struct netlink_callback *cb)
19baf839 1925{
a88ee229
SH
1926 struct leaf_info *li;
1927 struct hlist_node *node;
1928 int i, s_i;
19baf839 1929
71d67e66 1930 s_i = cb->args[4];
a88ee229 1931 i = 0;
19baf839 1932
a88ee229
SH
1933 /* rcu_read_lock is hold by caller */
1934 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1935 if (i < s_i) {
1936 i++;
19baf839 1937 continue;
a88ee229 1938 }
91b9a277 1939
a88ee229 1940 if (i > s_i)
71d67e66 1941 cb->args[5] = 0;
19baf839 1942
a88ee229 1943 if (list_empty(&li->falh))
19baf839
RO
1944 continue;
1945
a88ee229 1946 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
71d67e66 1947 cb->args[4] = i;
19baf839
RO
1948 return -1;
1949 }
a88ee229 1950 i++;
19baf839 1951 }
a88ee229 1952
71d67e66 1953 cb->args[4] = i;
19baf839
RO
1954 return skb->len;
1955}
1956
16c6cf8b
SH
1957int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1958 struct netlink_callback *cb)
19baf839 1959{
a88ee229 1960 struct leaf *l;
19baf839 1961 struct trie *t = (struct trie *) tb->tb_data;
d5ce8a0e 1962 t_key key = cb->args[2];
71d67e66 1963 int count = cb->args[3];
19baf839 1964
2373ce1c 1965 rcu_read_lock();
d5ce8a0e
SH
1966 /* Dump starting at last key.
1967 * Note: 0.0.0.0/0 (ie default) is first key.
1968 */
71d67e66 1969 if (count == 0)
d5ce8a0e
SH
1970 l = trie_firstleaf(t);
1971 else {
71d67e66
SH
1972 /* Normally, continue from last key, but if that is missing
1973 * fallback to using slow rescan
1974 */
d5ce8a0e 1975 l = fib_find_node(t, key);
71d67e66
SH
1976 if (!l)
1977 l = trie_leafindex(t, count);
d5ce8a0e 1978 }
a88ee229 1979
d5ce8a0e
SH
1980 while (l) {
1981 cb->args[2] = l->key;
a88ee229 1982 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
71d67e66 1983 cb->args[3] = count;
a88ee229 1984 rcu_read_unlock();
a88ee229 1985 return -1;
19baf839 1986 }
d5ce8a0e 1987
71d67e66 1988 ++count;
d5ce8a0e 1989 l = trie_nextleaf(l);
71d67e66
SH
1990 memset(&cb->args[4], 0,
1991 sizeof(cb->args) - 4*sizeof(cb->args[0]));
19baf839 1992 }
71d67e66 1993 cb->args[3] = count;
2373ce1c 1994 rcu_read_unlock();
a88ee229 1995
19baf839 1996 return skb->len;
19baf839
RO
1997}
1998
7f9b8052
SH
1999void __init fib_hash_init(void)
2000{
a07f5f50
SH
2001 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2002 sizeof(struct fib_alias),
bc3c8c1e
SH
2003 0, SLAB_PANIC, NULL);
2004
2005 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2006 max(sizeof(struct leaf),
2007 sizeof(struct leaf_info)),
2008 0, SLAB_PANIC, NULL);
7f9b8052 2009}
19baf839 2010
7f9b8052
SH
2011
2012/* Fix more generic FIB names for init later */
2013struct fib_table *fib_hash_table(u32 id)
19baf839
RO
2014{
2015 struct fib_table *tb;
2016 struct trie *t;
2017
19baf839
RO
2018 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2019 GFP_KERNEL);
2020 if (tb == NULL)
2021 return NULL;
2022
2023 tb->tb_id = id;
971b893e 2024 tb->tb_default = -1;
19baf839
RO
2025
2026 t = (struct trie *) tb->tb_data;
c28a1cf4 2027 memset(t, 0, sizeof(*t));
19baf839 2028
19baf839 2029 if (id == RT_TABLE_LOCAL)
a07f5f50 2030 pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
19baf839
RO
2031
2032 return tb;
2033}
2034
cb7b593c
SH
2035#ifdef CONFIG_PROC_FS
2036/* Depth first Trie walk iterator */
2037struct fib_trie_iter {
1c340b2f 2038 struct seq_net_private p;
3d3b2d25 2039 struct fib_table *tb;
cb7b593c 2040 struct tnode *tnode;
cb7b593c
SH
2041 unsigned index;
2042 unsigned depth;
2043};
19baf839 2044
cb7b593c 2045static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
19baf839 2046{
cb7b593c
SH
2047 struct tnode *tn = iter->tnode;
2048 unsigned cindex = iter->index;
2049 struct tnode *p;
19baf839 2050
6640e697
EB
2051 /* A single entry routing table */
2052 if (!tn)
2053 return NULL;
2054
cb7b593c
SH
2055 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2056 iter->tnode, iter->index, iter->depth);
2057rescan:
2058 while (cindex < (1<<tn->bits)) {
b59cfbf7 2059 struct node *n = tnode_get_child_rcu(tn, cindex);
19baf839 2060
cb7b593c
SH
2061 if (n) {
2062 if (IS_LEAF(n)) {
2063 iter->tnode = tn;
2064 iter->index = cindex + 1;
2065 } else {
2066 /* push down one level */
2067 iter->tnode = (struct tnode *) n;
2068 iter->index = 0;
2069 ++iter->depth;
2070 }
2071 return n;
2072 }
19baf839 2073
cb7b593c
SH
2074 ++cindex;
2075 }
91b9a277 2076
cb7b593c 2077 /* Current node exhausted, pop back up */
b59cfbf7 2078 p = node_parent_rcu((struct node *)tn);
cb7b593c
SH
2079 if (p) {
2080 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2081 tn = p;
2082 --iter->depth;
2083 goto rescan;
19baf839 2084 }
cb7b593c
SH
2085
2086 /* got root? */
2087 return NULL;
19baf839
RO
2088}
2089
cb7b593c
SH
2090static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2091 struct trie *t)
19baf839 2092{
3d3b2d25 2093 struct node *n;
5ddf0eb2 2094
132adf54 2095 if (!t)
5ddf0eb2
RO
2096 return NULL;
2097
2098 n = rcu_dereference(t->trie);
3d3b2d25 2099 if (!n)
5ddf0eb2 2100 return NULL;
19baf839 2101
3d3b2d25
SH
2102 if (IS_TNODE(n)) {
2103 iter->tnode = (struct tnode *) n;
2104 iter->index = 0;
2105 iter->depth = 1;
2106 } else {
2107 iter->tnode = NULL;
2108 iter->index = 0;
2109 iter->depth = 0;
91b9a277 2110 }
3d3b2d25
SH
2111
2112 return n;
cb7b593c 2113}
91b9a277 2114
cb7b593c
SH
2115static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2116{
2117 struct node *n;
2118 struct fib_trie_iter iter;
91b9a277 2119
cb7b593c 2120 memset(s, 0, sizeof(*s));
91b9a277 2121
cb7b593c 2122 rcu_read_lock();
3d3b2d25 2123 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
cb7b593c 2124 if (IS_LEAF(n)) {
93672292
SH
2125 struct leaf *l = (struct leaf *)n;
2126 struct leaf_info *li;
2127 struct hlist_node *tmp;
2128
cb7b593c
SH
2129 s->leaves++;
2130 s->totdepth += iter.depth;
2131 if (iter.depth > s->maxdepth)
2132 s->maxdepth = iter.depth;
93672292
SH
2133
2134 hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2135 ++s->prefixes;
cb7b593c
SH
2136 } else {
2137 const struct tnode *tn = (const struct tnode *) n;
2138 int i;
2139
2140 s->tnodes++;
132adf54 2141 if (tn->bits < MAX_STAT_DEPTH)
06ef921d
RO
2142 s->nodesizes[tn->bits]++;
2143
cb7b593c
SH
2144 for (i = 0; i < (1<<tn->bits); i++)
2145 if (!tn->child[i])
2146 s->nullpointers++;
19baf839 2147 }
19baf839 2148 }
2373ce1c 2149 rcu_read_unlock();
19baf839
RO
2150}
2151
cb7b593c
SH
2152/*
2153 * This outputs /proc/net/fib_triestats
2154 */
2155static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
19baf839 2156{
cb7b593c 2157 unsigned i, max, pointers, bytes, avdepth;
c877efb2 2158
cb7b593c
SH
2159 if (stat->leaves)
2160 avdepth = stat->totdepth*100 / stat->leaves;
2161 else
2162 avdepth = 0;
91b9a277 2163
a07f5f50
SH
2164 seq_printf(seq, "\tAver depth: %u.%02d\n",
2165 avdepth / 100, avdepth % 100);
cb7b593c 2166 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
91b9a277 2167
cb7b593c 2168 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
cb7b593c 2169 bytes = sizeof(struct leaf) * stat->leaves;
93672292
SH
2170
2171 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2172 bytes += sizeof(struct leaf_info) * stat->prefixes;
2173
187b5188 2174 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
cb7b593c 2175 bytes += sizeof(struct tnode) * stat->tnodes;
19baf839 2176
06ef921d
RO
2177 max = MAX_STAT_DEPTH;
2178 while (max > 0 && stat->nodesizes[max-1] == 0)
cb7b593c 2179 max--;
19baf839 2180
cb7b593c
SH
2181 pointers = 0;
2182 for (i = 1; i <= max; i++)
2183 if (stat->nodesizes[i] != 0) {
187b5188 2184 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
cb7b593c
SH
2185 pointers += (1<<i) * stat->nodesizes[i];
2186 }
2187 seq_putc(seq, '\n');
187b5188 2188 seq_printf(seq, "\tPointers: %u\n", pointers);
2373ce1c 2189
cb7b593c 2190 bytes += sizeof(struct node *) * pointers;
187b5188
SH
2191 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2192 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
66a2f7fd 2193}
2373ce1c 2194
cb7b593c 2195#ifdef CONFIG_IP_FIB_TRIE_STATS
66a2f7fd
SH
2196static void trie_show_usage(struct seq_file *seq,
2197 const struct trie_use_stats *stats)
2198{
2199 seq_printf(seq, "\nCounters:\n---------\n");
a07f5f50
SH
2200 seq_printf(seq, "gets = %u\n", stats->gets);
2201 seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2202 seq_printf(seq, "semantic match passed = %u\n",
2203 stats->semantic_match_passed);
2204 seq_printf(seq, "semantic match miss = %u\n",
2205 stats->semantic_match_miss);
2206 seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2207 seq_printf(seq, "skipped node resize = %u\n\n",
2208 stats->resize_node_skipped);
cb7b593c 2209}
66a2f7fd
SH
2210#endif /* CONFIG_IP_FIB_TRIE_STATS */
2211
3d3b2d25 2212static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
d717a9a6 2213{
3d3b2d25
SH
2214 if (tb->tb_id == RT_TABLE_LOCAL)
2215 seq_puts(seq, "Local:\n");
2216 else if (tb->tb_id == RT_TABLE_MAIN)
2217 seq_puts(seq, "Main:\n");
2218 else
2219 seq_printf(seq, "Id %d:\n", tb->tb_id);
d717a9a6 2220}
19baf839 2221
3d3b2d25 2222
cb7b593c
SH
2223static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2224{
1c340b2f 2225 struct net *net = (struct net *)seq->private;
3d3b2d25 2226 unsigned int h;
877a9bff 2227
d717a9a6 2228 seq_printf(seq,
a07f5f50
SH
2229 "Basic info: size of leaf:"
2230 " %Zd bytes, size of tnode: %Zd bytes.\n",
d717a9a6
SH
2231 sizeof(struct leaf), sizeof(struct tnode));
2232
3d3b2d25
SH
2233 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2234 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2235 struct hlist_node *node;
2236 struct fib_table *tb;
2237
2238 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2239 struct trie *t = (struct trie *) tb->tb_data;
2240 struct trie_stat stat;
877a9bff 2241
3d3b2d25
SH
2242 if (!t)
2243 continue;
2244
2245 fib_table_print(seq, tb);
2246
2247 trie_collect_stats(t, &stat);
2248 trie_show_stats(seq, &stat);
2249#ifdef CONFIG_IP_FIB_TRIE_STATS
2250 trie_show_usage(seq, &t->stats);
2251#endif
2252 }
2253 }
19baf839 2254
cb7b593c 2255 return 0;
19baf839
RO
2256}
2257
cb7b593c 2258static int fib_triestat_seq_open(struct inode *inode, struct file *file)
19baf839 2259{
de05c557 2260 return single_open_net(inode, file, fib_triestat_seq_show);
1c340b2f
DL
2261}
2262
9a32144e 2263static const struct file_operations fib_triestat_fops = {
cb7b593c
SH
2264 .owner = THIS_MODULE,
2265 .open = fib_triestat_seq_open,
2266 .read = seq_read,
2267 .llseek = seq_lseek,
b6fcbdb4 2268 .release = single_release_net,
cb7b593c
SH
2269};
2270
1218854a 2271static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
19baf839 2272{
1218854a
YH
2273 struct fib_trie_iter *iter = seq->private;
2274 struct net *net = seq_file_net(seq);
cb7b593c 2275 loff_t idx = 0;
3d3b2d25 2276 unsigned int h;
cb7b593c 2277
3d3b2d25
SH
2278 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2279 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2280 struct hlist_node *node;
2281 struct fib_table *tb;
cb7b593c 2282
3d3b2d25
SH
2283 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2284 struct node *n;
2285
2286 for (n = fib_trie_get_first(iter,
2287 (struct trie *) tb->tb_data);
2288 n; n = fib_trie_get_next(iter))
2289 if (pos == idx++) {
2290 iter->tb = tb;
2291 return n;
2292 }
2293 }
cb7b593c 2294 }
3d3b2d25 2295
19baf839
RO
2296 return NULL;
2297}
2298
cb7b593c 2299static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
c95aaf9a 2300 __acquires(RCU)
19baf839 2301{
cb7b593c 2302 rcu_read_lock();
1218854a 2303 return fib_trie_get_idx(seq, *pos);
19baf839
RO
2304}
2305
cb7b593c 2306static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
19baf839 2307{
cb7b593c 2308 struct fib_trie_iter *iter = seq->private;
1218854a 2309 struct net *net = seq_file_net(seq);
3d3b2d25
SH
2310 struct fib_table *tb = iter->tb;
2311 struct hlist_node *tb_node;
2312 unsigned int h;
2313 struct node *n;
cb7b593c 2314
19baf839 2315 ++*pos;
3d3b2d25
SH
2316 /* next node in same table */
2317 n = fib_trie_get_next(iter);
2318 if (n)
2319 return n;
19baf839 2320
3d3b2d25
SH
2321 /* walk rest of this hash chain */
2322 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2323 while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2324 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2325 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2326 if (n)
2327 goto found;
2328 }
19baf839 2329
3d3b2d25
SH
2330 /* new hash chain */
2331 while (++h < FIB_TABLE_HASHSZ) {
2332 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2333 hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2334 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2335 if (n)
2336 goto found;
2337 }
2338 }
cb7b593c 2339 return NULL;
3d3b2d25
SH
2340
2341found:
2342 iter->tb = tb;
2343 return n;
cb7b593c 2344}
19baf839 2345
cb7b593c 2346static void fib_trie_seq_stop(struct seq_file *seq, void *v)
c95aaf9a 2347 __releases(RCU)
19baf839 2348{
cb7b593c
SH
2349 rcu_read_unlock();
2350}
91b9a277 2351
cb7b593c
SH
2352static void seq_indent(struct seq_file *seq, int n)
2353{
2354 while (n-- > 0) seq_puts(seq, " ");
2355}
19baf839 2356
28d36e37 2357static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
cb7b593c 2358{
132adf54 2359 switch (s) {
cb7b593c
SH
2360 case RT_SCOPE_UNIVERSE: return "universe";
2361 case RT_SCOPE_SITE: return "site";
2362 case RT_SCOPE_LINK: return "link";
2363 case RT_SCOPE_HOST: return "host";
2364 case RT_SCOPE_NOWHERE: return "nowhere";
2365 default:
28d36e37 2366 snprintf(buf, len, "scope=%d", s);
cb7b593c
SH
2367 return buf;
2368 }
2369}
19baf839 2370
36cbd3dc 2371static const char *const rtn_type_names[__RTN_MAX] = {
cb7b593c
SH
2372 [RTN_UNSPEC] = "UNSPEC",
2373 [RTN_UNICAST] = "UNICAST",
2374 [RTN_LOCAL] = "LOCAL",
2375 [RTN_BROADCAST] = "BROADCAST",
2376 [RTN_ANYCAST] = "ANYCAST",
2377 [RTN_MULTICAST] = "MULTICAST",
2378 [RTN_BLACKHOLE] = "BLACKHOLE",
2379 [RTN_UNREACHABLE] = "UNREACHABLE",
2380 [RTN_PROHIBIT] = "PROHIBIT",
2381 [RTN_THROW] = "THROW",
2382 [RTN_NAT] = "NAT",
2383 [RTN_XRESOLVE] = "XRESOLVE",
2384};
19baf839 2385
28d36e37 2386static inline const char *rtn_type(char *buf, size_t len, unsigned t)
cb7b593c 2387{
cb7b593c
SH
2388 if (t < __RTN_MAX && rtn_type_names[t])
2389 return rtn_type_names[t];
28d36e37 2390 snprintf(buf, len, "type %u", t);
cb7b593c 2391 return buf;
19baf839
RO
2392}
2393
cb7b593c
SH
2394/* Pretty print the trie */
2395static int fib_trie_seq_show(struct seq_file *seq, void *v)
19baf839 2396{
cb7b593c
SH
2397 const struct fib_trie_iter *iter = seq->private;
2398 struct node *n = v;
c877efb2 2399
3d3b2d25
SH
2400 if (!node_parent_rcu(n))
2401 fib_table_print(seq, iter->tb);
095b8501 2402
cb7b593c
SH
2403 if (IS_TNODE(n)) {
2404 struct tnode *tn = (struct tnode *) n;
ab66b4a7 2405 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
91b9a277 2406
1d25cd6c 2407 seq_indent(seq, iter->depth-1);
673d57e7
HH
2408 seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
2409 &prf, tn->pos, tn->bits, tn->full_children,
1d25cd6c 2410 tn->empty_children);
e905a9ed 2411
cb7b593c
SH
2412 } else {
2413 struct leaf *l = (struct leaf *) n;
1328042e
SH
2414 struct leaf_info *li;
2415 struct hlist_node *node;
32ab5f80 2416 __be32 val = htonl(l->key);
cb7b593c
SH
2417
2418 seq_indent(seq, iter->depth);
673d57e7 2419 seq_printf(seq, " |-- %pI4\n", &val);
1328042e
SH
2420
2421 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2422 struct fib_alias *fa;
2423
2424 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2425 char buf1[32], buf2[32];
2426
2427 seq_indent(seq, iter->depth+1);
2428 seq_printf(seq, " /%d %s %s", li->plen,
2429 rtn_scope(buf1, sizeof(buf1),
2430 fa->fa_scope),
2431 rtn_type(buf2, sizeof(buf2),
2432 fa->fa_type));
2433 if (fa->fa_tos)
b9c4d82a 2434 seq_printf(seq, " tos=%d", fa->fa_tos);
1328042e 2435 seq_putc(seq, '\n');
cb7b593c
SH
2436 }
2437 }
19baf839 2438 }
cb7b593c 2439
19baf839
RO
2440 return 0;
2441}
2442
f690808e 2443static const struct seq_operations fib_trie_seq_ops = {
cb7b593c
SH
2444 .start = fib_trie_seq_start,
2445 .next = fib_trie_seq_next,
2446 .stop = fib_trie_seq_stop,
2447 .show = fib_trie_seq_show,
19baf839
RO
2448};
2449
cb7b593c 2450static int fib_trie_seq_open(struct inode *inode, struct file *file)
19baf839 2451{
1c340b2f
DL
2452 return seq_open_net(inode, file, &fib_trie_seq_ops,
2453 sizeof(struct fib_trie_iter));
19baf839
RO
2454}
2455
9a32144e 2456static const struct file_operations fib_trie_fops = {
cb7b593c
SH
2457 .owner = THIS_MODULE,
2458 .open = fib_trie_seq_open,
2459 .read = seq_read,
2460 .llseek = seq_lseek,
1c340b2f 2461 .release = seq_release_net,
19baf839
RO
2462};
2463
8315f5d8
SH
2464struct fib_route_iter {
2465 struct seq_net_private p;
2466 struct trie *main_trie;
2467 loff_t pos;
2468 t_key key;
2469};
2470
2471static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2472{
2473 struct leaf *l = NULL;
2474 struct trie *t = iter->main_trie;
2475
2476 /* use cache location of last found key */
2477 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2478 pos -= iter->pos;
2479 else {
2480 iter->pos = 0;
2481 l = trie_firstleaf(t);
2482 }
2483
2484 while (l && pos-- > 0) {
2485 iter->pos++;
2486 l = trie_nextleaf(l);
2487 }
2488
2489 if (l)
2490 iter->key = pos; /* remember it */
2491 else
2492 iter->pos = 0; /* forget it */
2493
2494 return l;
2495}
2496
2497static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2498 __acquires(RCU)
2499{
2500 struct fib_route_iter *iter = seq->private;
2501 struct fib_table *tb;
2502
2503 rcu_read_lock();
1218854a 2504 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
8315f5d8
SH
2505 if (!tb)
2506 return NULL;
2507
2508 iter->main_trie = (struct trie *) tb->tb_data;
2509 if (*pos == 0)
2510 return SEQ_START_TOKEN;
2511 else
2512 return fib_route_get_idx(iter, *pos - 1);
2513}
2514
2515static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2516{
2517 struct fib_route_iter *iter = seq->private;
2518 struct leaf *l = v;
2519
2520 ++*pos;
2521 if (v == SEQ_START_TOKEN) {
2522 iter->pos = 0;
2523 l = trie_firstleaf(iter->main_trie);
2524 } else {
2525 iter->pos++;
2526 l = trie_nextleaf(l);
2527 }
2528
2529 if (l)
2530 iter->key = l->key;
2531 else
2532 iter->pos = 0;
2533 return l;
2534}
2535
2536static void fib_route_seq_stop(struct seq_file *seq, void *v)
2537 __releases(RCU)
2538{
2539 rcu_read_unlock();
2540}
2541
32ab5f80 2542static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
19baf839 2543{
cb7b593c
SH
2544 static unsigned type2flags[RTN_MAX + 1] = {
2545 [7] = RTF_REJECT, [8] = RTF_REJECT,
2546 };
2547 unsigned flags = type2flags[type];
19baf839 2548
cb7b593c
SH
2549 if (fi && fi->fib_nh->nh_gw)
2550 flags |= RTF_GATEWAY;
32ab5f80 2551 if (mask == htonl(0xFFFFFFFF))
cb7b593c
SH
2552 flags |= RTF_HOST;
2553 flags |= RTF_UP;
2554 return flags;
19baf839
RO
2555}
2556
cb7b593c
SH
2557/*
2558 * This outputs /proc/net/route.
2559 * The format of the file is not supposed to be changed
2560 * and needs to be same as fib_hash output to avoid breaking
2561 * legacy utilities
2562 */
2563static int fib_route_seq_show(struct seq_file *seq, void *v)
19baf839 2564{
cb7b593c 2565 struct leaf *l = v;
1328042e
SH
2566 struct leaf_info *li;
2567 struct hlist_node *node;
19baf839 2568
cb7b593c
SH
2569 if (v == SEQ_START_TOKEN) {
2570 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2571 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2572 "\tWindow\tIRTT");
2573 return 0;
2574 }
19baf839 2575
1328042e 2576 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
cb7b593c 2577 struct fib_alias *fa;
32ab5f80 2578 __be32 mask, prefix;
91b9a277 2579
cb7b593c
SH
2580 mask = inet_make_mask(li->plen);
2581 prefix = htonl(l->key);
19baf839 2582
cb7b593c 2583 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1371e37d 2584 const struct fib_info *fi = fa->fa_info;
cb7b593c 2585 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
5e659e4c 2586 int len;
19baf839 2587
cb7b593c
SH
2588 if (fa->fa_type == RTN_BROADCAST
2589 || fa->fa_type == RTN_MULTICAST)
2590 continue;
19baf839 2591
cb7b593c 2592 if (fi)
5e659e4c
PE
2593 seq_printf(seq,
2594 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2595 "%d\t%08X\t%d\t%u\t%u%n",
cb7b593c
SH
2596 fi->fib_dev ? fi->fib_dev->name : "*",
2597 prefix,
2598 fi->fib_nh->nh_gw, flags, 0, 0,
2599 fi->fib_priority,
2600 mask,
a07f5f50
SH
2601 (fi->fib_advmss ?
2602 fi->fib_advmss + 40 : 0),
cb7b593c 2603 fi->fib_window,
5e659e4c 2604 fi->fib_rtt >> 3, &len);
cb7b593c 2605 else
5e659e4c
PE
2606 seq_printf(seq,
2607 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2608 "%d\t%08X\t%d\t%u\t%u%n",
cb7b593c 2609 prefix, 0, flags, 0, 0, 0,
5e659e4c 2610 mask, 0, 0, 0, &len);
19baf839 2611
5e659e4c 2612 seq_printf(seq, "%*s\n", 127 - len, "");
cb7b593c 2613 }
19baf839
RO
2614 }
2615
2616 return 0;
2617}
2618
f690808e 2619static const struct seq_operations fib_route_seq_ops = {
8315f5d8
SH
2620 .start = fib_route_seq_start,
2621 .next = fib_route_seq_next,
2622 .stop = fib_route_seq_stop,
cb7b593c 2623 .show = fib_route_seq_show,
19baf839
RO
2624};
2625
cb7b593c 2626static int fib_route_seq_open(struct inode *inode, struct file *file)
19baf839 2627{
1c340b2f 2628 return seq_open_net(inode, file, &fib_route_seq_ops,
8315f5d8 2629 sizeof(struct fib_route_iter));
19baf839
RO
2630}
2631
9a32144e 2632static const struct file_operations fib_route_fops = {
cb7b593c
SH
2633 .owner = THIS_MODULE,
2634 .open = fib_route_seq_open,
2635 .read = seq_read,
2636 .llseek = seq_lseek,
1c340b2f 2637 .release = seq_release_net,
19baf839
RO
2638};
2639
61a02653 2640int __net_init fib_proc_init(struct net *net)
19baf839 2641{
61a02653 2642 if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
cb7b593c
SH
2643 goto out1;
2644
61a02653
DL
2645 if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2646 &fib_triestat_fops))
cb7b593c
SH
2647 goto out2;
2648
61a02653 2649 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
cb7b593c
SH
2650 goto out3;
2651
19baf839 2652 return 0;
cb7b593c
SH
2653
2654out3:
61a02653 2655 proc_net_remove(net, "fib_triestat");
cb7b593c 2656out2:
61a02653 2657 proc_net_remove(net, "fib_trie");
cb7b593c
SH
2658out1:
2659 return -ENOMEM;
19baf839
RO
2660}
2661
61a02653 2662void __net_exit fib_proc_exit(struct net *net)
19baf839 2663{
61a02653
DL
2664 proc_net_remove(net, "fib_trie");
2665 proc_net_remove(net, "fib_triestat");
2666 proc_net_remove(net, "route");
19baf839
RO
2667}
2668
2669#endif /* CONFIG_PROC_FS */