]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/ipv4/fib_trie.c
Merge branch 'x86-mem-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git...
[net-next-2.6.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
e905a9ed 10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
19baf839 11 * Agricultural Sciences.
e905a9ed 12 *
19baf839
RO
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
e905a9ed 16 *
19baf839
RO
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
19baf839
RO
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
32 *
33 * IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
fd966255
RO
42 *
43 * Substantial contributions to this work comes from:
44 *
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
19baf839
RO
49 */
50
80b71b80 51#define VERSION "0.409"
19baf839 52
19baf839
RO
53#include <asm/uaccess.h>
54#include <asm/system.h>
1977f032 55#include <linux/bitops.h>
19baf839
RO
56#include <linux/types.h>
57#include <linux/kernel.h>
19baf839
RO
58#include <linux/mm.h>
59#include <linux/string.h>
60#include <linux/socket.h>
61#include <linux/sockios.h>
62#include <linux/errno.h>
63#include <linux/in.h>
64#include <linux/inet.h>
cd8787ab 65#include <linux/inetdevice.h>
19baf839
RO
66#include <linux/netdevice.h>
67#include <linux/if_arp.h>
68#include <linux/proc_fs.h>
2373ce1c 69#include <linux/rcupdate.h>
19baf839
RO
70#include <linux/skbuff.h>
71#include <linux/netlink.h>
72#include <linux/init.h>
73#include <linux/list.h>
5a0e3ad6 74#include <linux/slab.h>
457c4cbc 75#include <net/net_namespace.h>
19baf839
RO
76#include <net/ip.h>
77#include <net/protocol.h>
78#include <net/route.h>
79#include <net/tcp.h>
80#include <net/sock.h>
81#include <net/ip_fib.h>
82#include "fib_lookup.h"
83
06ef921d 84#define MAX_STAT_DEPTH 32
19baf839 85
19baf839 86#define KEYLENGTH (8*sizeof(t_key))
19baf839 87
19baf839
RO
88typedef unsigned int t_key;
89
90#define T_TNODE 0
91#define T_LEAF 1
92#define NODE_TYPE_MASK 0x1UL
2373ce1c
RO
93#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
94
91b9a277
OJ
95#define IS_TNODE(n) (!(n->parent & T_LEAF))
96#define IS_LEAF(n) (n->parent & T_LEAF)
19baf839
RO
97
98struct node {
91b9a277 99 unsigned long parent;
8d965444 100 t_key key;
19baf839
RO
101};
102
103struct leaf {
91b9a277 104 unsigned long parent;
8d965444 105 t_key key;
19baf839 106 struct hlist_head list;
2373ce1c 107 struct rcu_head rcu;
19baf839
RO
108};
109
110struct leaf_info {
111 struct hlist_node hlist;
2373ce1c 112 struct rcu_head rcu;
19baf839
RO
113 int plen;
114 struct list_head falh;
115};
116
117struct tnode {
91b9a277 118 unsigned long parent;
8d965444 119 t_key key;
112d8cfc
ED
120 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
121 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
8d965444
ED
122 unsigned int full_children; /* KEYLENGTH bits needed */
123 unsigned int empty_children; /* KEYLENGTH bits needed */
15be75cd
SH
124 union {
125 struct rcu_head rcu;
126 struct work_struct work;
e0f7cb8c 127 struct tnode *tnode_free;
15be75cd 128 };
91b9a277 129 struct node *child[0];
19baf839
RO
130};
131
132#ifdef CONFIG_IP_FIB_TRIE_STATS
133struct trie_use_stats {
134 unsigned int gets;
135 unsigned int backtrack;
136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit;
2f36895a 139 unsigned int resize_node_skipped;
19baf839
RO
140};
141#endif
142
143struct trie_stat {
144 unsigned int totdepth;
145 unsigned int maxdepth;
146 unsigned int tnodes;
147 unsigned int leaves;
148 unsigned int nullpointers;
93672292 149 unsigned int prefixes;
06ef921d 150 unsigned int nodesizes[MAX_STAT_DEPTH];
c877efb2 151};
19baf839
RO
152
153struct trie {
91b9a277 154 struct node *trie;
19baf839
RO
155#ifdef CONFIG_IP_FIB_TRIE_STATS
156 struct trie_use_stats stats;
157#endif
19baf839
RO
158};
159
19baf839 160static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
a07f5f50
SH
161static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
162 int wasfull);
19baf839 163static struct node *resize(struct trie *t, struct tnode *tn);
2f80b3c8
RO
164static struct tnode *inflate(struct trie *t, struct tnode *tn);
165static struct tnode *halve(struct trie *t, struct tnode *tn);
e0f7cb8c
JP
166/* tnodes to free after resize(); protected by RTNL */
167static struct tnode *tnode_free_head;
c3059477
JP
168static size_t tnode_free_size;
169
170/*
171 * synchronize_rcu after call_rcu for that many pages; it should be especially
172 * useful before resizing the root node with PREEMPT_NONE configs; the value was
173 * obtained experimentally, aiming to avoid visible slowdown.
174 */
175static const int sync_pages = 128;
19baf839 176
e18b890b 177static struct kmem_cache *fn_alias_kmem __read_mostly;
bc3c8c1e 178static struct kmem_cache *trie_leaf_kmem __read_mostly;
19baf839 179
06801916
SH
180static inline struct tnode *node_parent(struct node *node)
181{
b59cfbf7
ED
182 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
183}
184
185static inline struct tnode *node_parent_rcu(struct node *node)
186{
187 struct tnode *ret = node_parent(node);
06801916 188
f6b085b6
JP
189 return rcu_dereference_check(ret,
190 rcu_read_lock_held() ||
191 lockdep_rtnl_is_held());
06801916
SH
192}
193
6440cc9e
SH
194/* Same as rcu_assign_pointer
195 * but that macro() assumes that value is a pointer.
196 */
06801916
SH
197static inline void node_set_parent(struct node *node, struct tnode *ptr)
198{
6440cc9e
SH
199 smp_wmb();
200 node->parent = (unsigned long)ptr | NODE_TYPE(node);
06801916 201}
2373ce1c 202
b59cfbf7
ED
203static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
204{
205 BUG_ON(i >= 1U << tn->bits);
2373ce1c 206
b59cfbf7
ED
207 return tn->child[i];
208}
209
210static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
19baf839 211{
b59cfbf7 212 struct node *ret = tnode_get_child(tn, i);
19baf839 213
4eaa0e3c
ED
214 return rcu_dereference_check(ret,
215 rcu_read_lock_held() ||
216 lockdep_rtnl_is_held());
19baf839
RO
217}
218
bb435b8d 219static inline int tnode_child_length(const struct tnode *tn)
19baf839 220{
91b9a277 221 return 1 << tn->bits;
19baf839
RO
222}
223
ab66b4a7
SH
224static inline t_key mask_pfx(t_key k, unsigned short l)
225{
226 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
227}
228
19baf839
RO
229static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
230{
91b9a277 231 if (offset < KEYLENGTH)
19baf839 232 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
91b9a277 233 else
19baf839
RO
234 return 0;
235}
236
237static inline int tkey_equals(t_key a, t_key b)
238{
c877efb2 239 return a == b;
19baf839
RO
240}
241
242static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
243{
c877efb2
SH
244 if (bits == 0 || offset >= KEYLENGTH)
245 return 1;
91b9a277
OJ
246 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
247 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
c877efb2 248}
19baf839
RO
249
250static inline int tkey_mismatch(t_key a, int offset, t_key b)
251{
252 t_key diff = a ^ b;
253 int i = offset;
254
c877efb2
SH
255 if (!diff)
256 return 0;
257 while ((diff << i) >> (KEYLENGTH-1) == 0)
19baf839
RO
258 i++;
259 return i;
260}
261
19baf839 262/*
e905a9ed
YH
263 To understand this stuff, an understanding of keys and all their bits is
264 necessary. Every node in the trie has a key associated with it, but not
19baf839
RO
265 all of the bits in that key are significant.
266
267 Consider a node 'n' and its parent 'tp'.
268
e905a9ed
YH
269 If n is a leaf, every bit in its key is significant. Its presence is
270 necessitated by path compression, since during a tree traversal (when
271 searching for a leaf - unless we are doing an insertion) we will completely
272 ignore all skipped bits we encounter. Thus we need to verify, at the end of
273 a potentially successful search, that we have indeed been walking the
19baf839
RO
274 correct key path.
275
e905a9ed
YH
276 Note that we can never "miss" the correct key in the tree if present by
277 following the wrong path. Path compression ensures that segments of the key
278 that are the same for all keys with a given prefix are skipped, but the
279 skipped part *is* identical for each node in the subtrie below the skipped
280 bit! trie_insert() in this implementation takes care of that - note the
19baf839
RO
281 call to tkey_sub_equals() in trie_insert().
282
e905a9ed 283 if n is an internal node - a 'tnode' here, the various parts of its key
19baf839
RO
284 have many different meanings.
285
e905a9ed 286 Example:
19baf839
RO
287 _________________________________________________________________
288 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
289 -----------------------------------------------------------------
e905a9ed 290 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
19baf839
RO
291
292 _________________________________________________________________
293 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
294 -----------------------------------------------------------------
295 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
296
297 tp->pos = 7
298 tp->bits = 3
299 n->pos = 15
91b9a277 300 n->bits = 4
19baf839 301
e905a9ed
YH
302 First, let's just ignore the bits that come before the parent tp, that is
303 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
19baf839
RO
304 not use them for anything.
305
306 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
e905a9ed 307 index into the parent's child array. That is, they will be used to find
19baf839
RO
308 'n' among tp's children.
309
310 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
311 for the node n.
312
e905a9ed 313 All the bits we have seen so far are significant to the node n. The rest
19baf839
RO
314 of the bits are really not needed or indeed known in n->key.
315
e905a9ed 316 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
19baf839 317 n's child array, and will of course be different for each child.
e905a9ed 318
c877efb2 319
19baf839
RO
320 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
321 at this point.
322
323*/
324
0c7770c7 325static inline void check_tnode(const struct tnode *tn)
19baf839 326{
0c7770c7 327 WARN_ON(tn && tn->pos+tn->bits > 32);
19baf839
RO
328}
329
f5026fab
DL
330static const int halve_threshold = 25;
331static const int inflate_threshold = 50;
345aa031 332static const int halve_threshold_root = 15;
80b71b80 333static const int inflate_threshold_root = 30;
2373ce1c
RO
334
335static void __alias_free_mem(struct rcu_head *head)
19baf839 336{
2373ce1c
RO
337 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
338 kmem_cache_free(fn_alias_kmem, fa);
19baf839
RO
339}
340
2373ce1c 341static inline void alias_free_mem_rcu(struct fib_alias *fa)
19baf839 342{
2373ce1c
RO
343 call_rcu(&fa->rcu, __alias_free_mem);
344}
91b9a277 345
2373ce1c
RO
346static void __leaf_free_rcu(struct rcu_head *head)
347{
bc3c8c1e
SH
348 struct leaf *l = container_of(head, struct leaf, rcu);
349 kmem_cache_free(trie_leaf_kmem, l);
2373ce1c 350}
91b9a277 351
387a5487
SH
352static inline void free_leaf(struct leaf *l)
353{
354 call_rcu_bh(&l->rcu, __leaf_free_rcu);
355}
356
2373ce1c 357static void __leaf_info_free_rcu(struct rcu_head *head)
19baf839 358{
2373ce1c 359 kfree(container_of(head, struct leaf_info, rcu));
19baf839
RO
360}
361
2373ce1c 362static inline void free_leaf_info(struct leaf_info *leaf)
19baf839 363{
2373ce1c 364 call_rcu(&leaf->rcu, __leaf_info_free_rcu);
19baf839
RO
365}
366
8d965444 367static struct tnode *tnode_alloc(size_t size)
f0e36f8c 368{
2373ce1c 369 if (size <= PAGE_SIZE)
8d965444 370 return kzalloc(size, GFP_KERNEL);
15be75cd
SH
371 else
372 return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
373}
2373ce1c 374
15be75cd
SH
375static void __tnode_vfree(struct work_struct *arg)
376{
377 struct tnode *tn = container_of(arg, struct tnode, work);
378 vfree(tn);
f0e36f8c
PM
379}
380
2373ce1c 381static void __tnode_free_rcu(struct rcu_head *head)
f0e36f8c 382{
2373ce1c 383 struct tnode *tn = container_of(head, struct tnode, rcu);
8d965444
ED
384 size_t size = sizeof(struct tnode) +
385 (sizeof(struct node *) << tn->bits);
f0e36f8c
PM
386
387 if (size <= PAGE_SIZE)
388 kfree(tn);
15be75cd
SH
389 else {
390 INIT_WORK(&tn->work, __tnode_vfree);
391 schedule_work(&tn->work);
392 }
f0e36f8c
PM
393}
394
2373ce1c
RO
395static inline void tnode_free(struct tnode *tn)
396{
387a5487
SH
397 if (IS_LEAF(tn))
398 free_leaf((struct leaf *) tn);
399 else
550e29bc 400 call_rcu(&tn->rcu, __tnode_free_rcu);
2373ce1c
RO
401}
402
e0f7cb8c
JP
403static void tnode_free_safe(struct tnode *tn)
404{
405 BUG_ON(IS_LEAF(tn));
7b85576d
JP
406 tn->tnode_free = tnode_free_head;
407 tnode_free_head = tn;
c3059477
JP
408 tnode_free_size += sizeof(struct tnode) +
409 (sizeof(struct node *) << tn->bits);
e0f7cb8c
JP
410}
411
412static void tnode_free_flush(void)
413{
414 struct tnode *tn;
415
416 while ((tn = tnode_free_head)) {
417 tnode_free_head = tn->tnode_free;
418 tn->tnode_free = NULL;
419 tnode_free(tn);
420 }
c3059477
JP
421
422 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
423 tnode_free_size = 0;
424 synchronize_rcu();
425 }
e0f7cb8c
JP
426}
427
2373ce1c
RO
428static struct leaf *leaf_new(void)
429{
bc3c8c1e 430 struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
2373ce1c
RO
431 if (l) {
432 l->parent = T_LEAF;
433 INIT_HLIST_HEAD(&l->list);
434 }
435 return l;
436}
437
438static struct leaf_info *leaf_info_new(int plen)
439{
440 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
441 if (li) {
442 li->plen = plen;
443 INIT_LIST_HEAD(&li->falh);
444 }
445 return li;
446}
447
a07f5f50 448static struct tnode *tnode_new(t_key key, int pos, int bits)
19baf839 449{
8d965444 450 size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
f0e36f8c 451 struct tnode *tn = tnode_alloc(sz);
19baf839 452
91b9a277 453 if (tn) {
2373ce1c 454 tn->parent = T_TNODE;
19baf839
RO
455 tn->pos = pos;
456 tn->bits = bits;
457 tn->key = key;
458 tn->full_children = 0;
459 tn->empty_children = 1<<bits;
460 }
c877efb2 461
8d965444
ED
462 pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
463 (unsigned long) (sizeof(struct node) << bits));
19baf839
RO
464 return tn;
465}
466
19baf839
RO
467/*
468 * Check whether a tnode 'n' is "full", i.e. it is an internal node
469 * and no bits are skipped. See discussion in dyntree paper p. 6
470 */
471
bb435b8d 472static inline int tnode_full(const struct tnode *tn, const struct node *n)
19baf839 473{
c877efb2 474 if (n == NULL || IS_LEAF(n))
19baf839
RO
475 return 0;
476
477 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
478}
479
a07f5f50
SH
480static inline void put_child(struct trie *t, struct tnode *tn, int i,
481 struct node *n)
19baf839
RO
482{
483 tnode_put_child_reorg(tn, i, n, -1);
484}
485
c877efb2 486 /*
19baf839
RO
487 * Add a child at position i overwriting the old value.
488 * Update the value of full_children and empty_children.
489 */
490
a07f5f50
SH
491static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
492 int wasfull)
19baf839 493{
2373ce1c 494 struct node *chi = tn->child[i];
19baf839
RO
495 int isfull;
496
0c7770c7
SH
497 BUG_ON(i >= 1<<tn->bits);
498
19baf839
RO
499 /* update emptyChildren */
500 if (n == NULL && chi != NULL)
501 tn->empty_children++;
502 else if (n != NULL && chi == NULL)
503 tn->empty_children--;
c877efb2 504
19baf839 505 /* update fullChildren */
91b9a277 506 if (wasfull == -1)
19baf839
RO
507 wasfull = tnode_full(tn, chi);
508
509 isfull = tnode_full(tn, n);
c877efb2 510 if (wasfull && !isfull)
19baf839 511 tn->full_children--;
c877efb2 512 else if (!wasfull && isfull)
19baf839 513 tn->full_children++;
91b9a277 514
c877efb2 515 if (n)
06801916 516 node_set_parent(n, tn);
19baf839 517
2373ce1c 518 rcu_assign_pointer(tn->child[i], n);
19baf839
RO
519}
520
80b71b80 521#define MAX_WORK 10
c877efb2 522static struct node *resize(struct trie *t, struct tnode *tn)
19baf839
RO
523{
524 int i;
2f80b3c8 525 struct tnode *old_tn;
e6308be8
RO
526 int inflate_threshold_use;
527 int halve_threshold_use;
80b71b80 528 int max_work;
19baf839 529
e905a9ed 530 if (!tn)
19baf839
RO
531 return NULL;
532
0c7770c7
SH
533 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
534 tn, inflate_threshold, halve_threshold);
19baf839
RO
535
536 /* No children */
537 if (tn->empty_children == tnode_child_length(tn)) {
e0f7cb8c 538 tnode_free_safe(tn);
19baf839
RO
539 return NULL;
540 }
541 /* One child */
542 if (tn->empty_children == tnode_child_length(tn) - 1)
80b71b80 543 goto one_child;
c877efb2 544 /*
19baf839
RO
545 * Double as long as the resulting node has a number of
546 * nonempty nodes that are above the threshold.
547 */
548
549 /*
c877efb2
SH
550 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
551 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 552 * Telecommunications, page 6:
c877efb2 553 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
554 * children in the *doubled* node is at least 'high'."
555 *
c877efb2
SH
556 * 'high' in this instance is the variable 'inflate_threshold'. It
557 * is expressed as a percentage, so we multiply it with
558 * tnode_child_length() and instead of multiplying by 2 (since the
559 * child array will be doubled by inflate()) and multiplying
560 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 561 * multiply the left-hand side by 50.
c877efb2
SH
562 *
563 * The left-hand side may look a bit weird: tnode_child_length(tn)
564 * - tn->empty_children is of course the number of non-null children
565 * in the current node. tn->full_children is the number of "full"
19baf839 566 * children, that is non-null tnodes with a skip value of 0.
c877efb2 567 * All of those will be doubled in the resulting inflated tnode, so
19baf839 568 * we just count them one extra time here.
c877efb2 569 *
19baf839 570 * A clearer way to write this would be:
c877efb2 571 *
19baf839 572 * to_be_doubled = tn->full_children;
c877efb2 573 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
574 * tn->full_children;
575 *
576 * new_child_length = tnode_child_length(tn) * 2;
577 *
c877efb2 578 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
579 * new_child_length;
580 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
581 *
582 * ...and so on, tho it would mess up the while () loop.
583 *
19baf839
RO
584 * anyway,
585 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
586 * inflate_threshold
c877efb2 587 *
19baf839
RO
588 * avoid a division:
589 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
590 * inflate_threshold * new_child_length
c877efb2 591 *
19baf839 592 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 593 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 594 * tn->full_children) >= inflate_threshold * new_child_length
c877efb2 595 *
19baf839 596 * expand new_child_length:
c877efb2 597 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 598 * tn->full_children) >=
19baf839 599 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 600 *
19baf839 601 * shorten again:
c877efb2 602 * 50 * (tn->full_children + tnode_child_length(tn) -
91b9a277 603 * tn->empty_children) >= inflate_threshold *
19baf839 604 * tnode_child_length(tn)
c877efb2 605 *
19baf839
RO
606 */
607
608 check_tnode(tn);
c877efb2 609
e6308be8
RO
610 /* Keep root node larger */
611
80b71b80
JL
612 if (!node_parent((struct node*) tn)) {
613 inflate_threshold_use = inflate_threshold_root;
614 halve_threshold_use = halve_threshold_root;
615 }
616 else {
e6308be8 617 inflate_threshold_use = inflate_threshold;
80b71b80
JL
618 halve_threshold_use = halve_threshold;
619 }
e6308be8 620
80b71b80
JL
621 max_work = MAX_WORK;
622 while ((tn->full_children > 0 && max_work-- &&
a07f5f50
SH
623 50 * (tn->full_children + tnode_child_length(tn)
624 - tn->empty_children)
625 >= inflate_threshold_use * tnode_child_length(tn))) {
19baf839 626
2f80b3c8
RO
627 old_tn = tn;
628 tn = inflate(t, tn);
a07f5f50 629
2f80b3c8
RO
630 if (IS_ERR(tn)) {
631 tn = old_tn;
2f36895a
RO
632#ifdef CONFIG_IP_FIB_TRIE_STATS
633 t->stats.resize_node_skipped++;
634#endif
635 break;
636 }
19baf839
RO
637 }
638
639 check_tnode(tn);
640
80b71b80
JL
641 /* Return if at least one inflate is run */
642 if( max_work != MAX_WORK)
643 return (struct node *) tn;
644
19baf839
RO
645 /*
646 * Halve as long as the number of empty children in this
647 * node is above threshold.
648 */
2f36895a 649
80b71b80
JL
650 max_work = MAX_WORK;
651 while (tn->bits > 1 && max_work-- &&
19baf839 652 100 * (tnode_child_length(tn) - tn->empty_children) <
e6308be8 653 halve_threshold_use * tnode_child_length(tn)) {
2f36895a 654
2f80b3c8
RO
655 old_tn = tn;
656 tn = halve(t, tn);
657 if (IS_ERR(tn)) {
658 tn = old_tn;
2f36895a
RO
659#ifdef CONFIG_IP_FIB_TRIE_STATS
660 t->stats.resize_node_skipped++;
661#endif
662 break;
663 }
664 }
19baf839 665
c877efb2 666
19baf839 667 /* Only one child remains */
80b71b80
JL
668 if (tn->empty_children == tnode_child_length(tn) - 1) {
669one_child:
19baf839 670 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 671 struct node *n;
19baf839 672
91b9a277 673 n = tn->child[i];
2373ce1c 674 if (!n)
91b9a277 675 continue;
91b9a277
OJ
676
677 /* compress one level */
678
06801916 679 node_set_parent(n, NULL);
e0f7cb8c 680 tnode_free_safe(tn);
91b9a277 681 return n;
19baf839 682 }
80b71b80 683 }
19baf839
RO
684 return (struct node *) tn;
685}
686
2f80b3c8 687static struct tnode *inflate(struct trie *t, struct tnode *tn)
19baf839 688{
19baf839
RO
689 struct tnode *oldtnode = tn;
690 int olen = tnode_child_length(tn);
691 int i;
692
0c7770c7 693 pr_debug("In inflate\n");
19baf839
RO
694
695 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
696
0c7770c7 697 if (!tn)
2f80b3c8 698 return ERR_PTR(-ENOMEM);
2f36895a
RO
699
700 /*
c877efb2
SH
701 * Preallocate and store tnodes before the actual work so we
702 * don't get into an inconsistent state if memory allocation
703 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
704 * of tnode is ignored.
705 */
91b9a277
OJ
706
707 for (i = 0; i < olen; i++) {
a07f5f50 708 struct tnode *inode;
2f36895a 709
a07f5f50 710 inode = (struct tnode *) tnode_get_child(oldtnode, i);
2f36895a
RO
711 if (inode &&
712 IS_TNODE(inode) &&
713 inode->pos == oldtnode->pos + oldtnode->bits &&
714 inode->bits > 1) {
715 struct tnode *left, *right;
ab66b4a7 716 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
c877efb2 717
2f36895a
RO
718 left = tnode_new(inode->key&(~m), inode->pos + 1,
719 inode->bits - 1);
2f80b3c8
RO
720 if (!left)
721 goto nomem;
91b9a277 722
2f36895a
RO
723 right = tnode_new(inode->key|m, inode->pos + 1,
724 inode->bits - 1);
725
e905a9ed 726 if (!right) {
2f80b3c8
RO
727 tnode_free(left);
728 goto nomem;
e905a9ed 729 }
2f36895a
RO
730
731 put_child(t, tn, 2*i, (struct node *) left);
732 put_child(t, tn, 2*i+1, (struct node *) right);
733 }
734 }
735
91b9a277 736 for (i = 0; i < olen; i++) {
c95aaf9a 737 struct tnode *inode;
19baf839 738 struct node *node = tnode_get_child(oldtnode, i);
91b9a277
OJ
739 struct tnode *left, *right;
740 int size, j;
c877efb2 741
19baf839
RO
742 /* An empty child */
743 if (node == NULL)
744 continue;
745
746 /* A leaf or an internal node with skipped bits */
747
c877efb2 748 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
19baf839 749 tn->pos + tn->bits - 1) {
a07f5f50
SH
750 if (tkey_extract_bits(node->key,
751 oldtnode->pos + oldtnode->bits,
752 1) == 0)
19baf839
RO
753 put_child(t, tn, 2*i, node);
754 else
755 put_child(t, tn, 2*i+1, node);
756 continue;
757 }
758
759 /* An internal node with two children */
760 inode = (struct tnode *) node;
761
762 if (inode->bits == 1) {
763 put_child(t, tn, 2*i, inode->child[0]);
764 put_child(t, tn, 2*i+1, inode->child[1]);
765
e0f7cb8c 766 tnode_free_safe(inode);
91b9a277 767 continue;
19baf839
RO
768 }
769
91b9a277
OJ
770 /* An internal node with more than two children */
771
772 /* We will replace this node 'inode' with two new
773 * ones, 'left' and 'right', each with half of the
774 * original children. The two new nodes will have
775 * a position one bit further down the key and this
776 * means that the "significant" part of their keys
777 * (see the discussion near the top of this file)
778 * will differ by one bit, which will be "0" in
779 * left's key and "1" in right's key. Since we are
780 * moving the key position by one step, the bit that
781 * we are moving away from - the bit at position
782 * (inode->pos) - is the one that will differ between
783 * left and right. So... we synthesize that bit in the
784 * two new keys.
785 * The mask 'm' below will be a single "one" bit at
786 * the position (inode->pos)
787 */
19baf839 788
91b9a277
OJ
789 /* Use the old key, but set the new significant
790 * bit to zero.
791 */
2f36895a 792
91b9a277
OJ
793 left = (struct tnode *) tnode_get_child(tn, 2*i);
794 put_child(t, tn, 2*i, NULL);
2f36895a 795
91b9a277 796 BUG_ON(!left);
2f36895a 797
91b9a277
OJ
798 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
799 put_child(t, tn, 2*i+1, NULL);
19baf839 800
91b9a277 801 BUG_ON(!right);
19baf839 802
91b9a277
OJ
803 size = tnode_child_length(left);
804 for (j = 0; j < size; j++) {
805 put_child(t, left, j, inode->child[j]);
806 put_child(t, right, j, inode->child[j + size]);
19baf839 807 }
91b9a277
OJ
808 put_child(t, tn, 2*i, resize(t, left));
809 put_child(t, tn, 2*i+1, resize(t, right));
810
e0f7cb8c 811 tnode_free_safe(inode);
19baf839 812 }
e0f7cb8c 813 tnode_free_safe(oldtnode);
19baf839 814 return tn;
2f80b3c8
RO
815nomem:
816 {
817 int size = tnode_child_length(tn);
818 int j;
819
0c7770c7 820 for (j = 0; j < size; j++)
2f80b3c8
RO
821 if (tn->child[j])
822 tnode_free((struct tnode *)tn->child[j]);
823
824 tnode_free(tn);
0c7770c7 825
2f80b3c8
RO
826 return ERR_PTR(-ENOMEM);
827 }
19baf839
RO
828}
829
2f80b3c8 830static struct tnode *halve(struct trie *t, struct tnode *tn)
19baf839
RO
831{
832 struct tnode *oldtnode = tn;
833 struct node *left, *right;
834 int i;
835 int olen = tnode_child_length(tn);
836
0c7770c7 837 pr_debug("In halve\n");
c877efb2
SH
838
839 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 840
2f80b3c8
RO
841 if (!tn)
842 return ERR_PTR(-ENOMEM);
2f36895a
RO
843
844 /*
c877efb2
SH
845 * Preallocate and store tnodes before the actual work so we
846 * don't get into an inconsistent state if memory allocation
847 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
848 * of tnode is ignored.
849 */
850
91b9a277 851 for (i = 0; i < olen; i += 2) {
2f36895a
RO
852 left = tnode_get_child(oldtnode, i);
853 right = tnode_get_child(oldtnode, i+1);
c877efb2 854
2f36895a 855 /* Two nonempty children */
0c7770c7 856 if (left && right) {
2f80b3c8 857 struct tnode *newn;
0c7770c7 858
2f80b3c8 859 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
0c7770c7
SH
860
861 if (!newn)
2f80b3c8 862 goto nomem;
0c7770c7 863
2f80b3c8 864 put_child(t, tn, i/2, (struct node *)newn);
2f36895a 865 }
2f36895a 866
2f36895a 867 }
19baf839 868
91b9a277
OJ
869 for (i = 0; i < olen; i += 2) {
870 struct tnode *newBinNode;
871
19baf839
RO
872 left = tnode_get_child(oldtnode, i);
873 right = tnode_get_child(oldtnode, i+1);
c877efb2 874
19baf839
RO
875 /* At least one of the children is empty */
876 if (left == NULL) {
877 if (right == NULL) /* Both are empty */
878 continue;
879 put_child(t, tn, i/2, right);
91b9a277 880 continue;
0c7770c7 881 }
91b9a277
OJ
882
883 if (right == NULL) {
19baf839 884 put_child(t, tn, i/2, left);
91b9a277
OJ
885 continue;
886 }
c877efb2 887
19baf839 888 /* Two nonempty children */
91b9a277
OJ
889 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
890 put_child(t, tn, i/2, NULL);
91b9a277
OJ
891 put_child(t, newBinNode, 0, left);
892 put_child(t, newBinNode, 1, right);
893 put_child(t, tn, i/2, resize(t, newBinNode));
19baf839 894 }
e0f7cb8c 895 tnode_free_safe(oldtnode);
19baf839 896 return tn;
2f80b3c8
RO
897nomem:
898 {
899 int size = tnode_child_length(tn);
900 int j;
901
0c7770c7 902 for (j = 0; j < size; j++)
2f80b3c8
RO
903 if (tn->child[j])
904 tnode_free((struct tnode *)tn->child[j]);
905
906 tnode_free(tn);
0c7770c7 907
2f80b3c8
RO
908 return ERR_PTR(-ENOMEM);
909 }
19baf839
RO
910}
911
772cb712 912/* readside must use rcu_read_lock currently dump routines
2373ce1c
RO
913 via get_fa_head and dump */
914
772cb712 915static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
19baf839 916{
772cb712 917 struct hlist_head *head = &l->list;
19baf839
RO
918 struct hlist_node *node;
919 struct leaf_info *li;
920
2373ce1c 921 hlist_for_each_entry_rcu(li, node, head, hlist)
c877efb2 922 if (li->plen == plen)
19baf839 923 return li;
91b9a277 924
19baf839
RO
925 return NULL;
926}
927
a07f5f50 928static inline struct list_head *get_fa_head(struct leaf *l, int plen)
19baf839 929{
772cb712 930 struct leaf_info *li = find_leaf_info(l, plen);
c877efb2 931
91b9a277
OJ
932 if (!li)
933 return NULL;
c877efb2 934
91b9a277 935 return &li->falh;
19baf839
RO
936}
937
938static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
939{
e905a9ed
YH
940 struct leaf_info *li = NULL, *last = NULL;
941 struct hlist_node *node;
942
943 if (hlist_empty(head)) {
944 hlist_add_head_rcu(&new->hlist, head);
945 } else {
946 hlist_for_each_entry(li, node, head, hlist) {
947 if (new->plen > li->plen)
948 break;
949
950 last = li;
951 }
952 if (last)
953 hlist_add_after_rcu(&last->hlist, &new->hlist);
954 else
955 hlist_add_before_rcu(&new->hlist, &li->hlist);
956 }
19baf839
RO
957}
958
2373ce1c
RO
959/* rcu_read_lock needs to be hold by caller from readside */
960
19baf839
RO
961static struct leaf *
962fib_find_node(struct trie *t, u32 key)
963{
964 int pos;
965 struct tnode *tn;
966 struct node *n;
967
968 pos = 0;
634a4b20
PM
969 n = rcu_dereference_check(t->trie,
970 rcu_read_lock_held() ||
971 lockdep_rtnl_is_held());
19baf839
RO
972
973 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
974 tn = (struct tnode *) n;
91b9a277 975
19baf839 976 check_tnode(tn);
91b9a277 977
c877efb2 978 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
91b9a277 979 pos = tn->pos + tn->bits;
a07f5f50
SH
980 n = tnode_get_child_rcu(tn,
981 tkey_extract_bits(key,
982 tn->pos,
983 tn->bits));
91b9a277 984 } else
19baf839
RO
985 break;
986 }
987 /* Case we have found a leaf. Compare prefixes */
988
91b9a277
OJ
989 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
990 return (struct leaf *)n;
991
19baf839
RO
992 return NULL;
993}
994
7b85576d 995static void trie_rebalance(struct trie *t, struct tnode *tn)
19baf839 996{
19baf839 997 int wasfull;
3ed18d76 998 t_key cindex, key;
06801916 999 struct tnode *tp;
19baf839 1000
3ed18d76
RO
1001 key = tn->key;
1002
06801916 1003 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
19baf839
RO
1004 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1005 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
a07f5f50
SH
1006 tn = (struct tnode *) resize(t, (struct tnode *)tn);
1007
1008 tnode_put_child_reorg((struct tnode *)tp, cindex,
1009 (struct node *)tn, wasfull);
91b9a277 1010
06801916 1011 tp = node_parent((struct node *) tn);
008440e3
JP
1012 if (!tp)
1013 rcu_assign_pointer(t->trie, (struct node *)tn);
1014
e0f7cb8c 1015 tnode_free_flush();
06801916 1016 if (!tp)
19baf839 1017 break;
06801916 1018 tn = tp;
19baf839 1019 }
06801916 1020
19baf839 1021 /* Handle last (top) tnode */
7b85576d 1022 if (IS_TNODE(tn))
a07f5f50 1023 tn = (struct tnode *)resize(t, (struct tnode *)tn);
19baf839 1024
7b85576d
JP
1025 rcu_assign_pointer(t->trie, (struct node *)tn);
1026 tnode_free_flush();
19baf839
RO
1027}
1028
2373ce1c
RO
1029/* only used from updater-side */
1030
fea86ad8 1031static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
19baf839
RO
1032{
1033 int pos, newpos;
1034 struct tnode *tp = NULL, *tn = NULL;
1035 struct node *n;
1036 struct leaf *l;
1037 int missbit;
c877efb2 1038 struct list_head *fa_head = NULL;
19baf839
RO
1039 struct leaf_info *li;
1040 t_key cindex;
1041
1042 pos = 0;
c877efb2 1043 n = t->trie;
19baf839 1044
c877efb2
SH
1045 /* If we point to NULL, stop. Either the tree is empty and we should
1046 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 1047 * and we should just put our new leaf in that.
c877efb2
SH
1048 * If we point to a T_TNODE, check if it matches our key. Note that
1049 * a T_TNODE might be skipping any number of bits - its 'pos' need
19baf839
RO
1050 * not be the parent's 'pos'+'bits'!
1051 *
c877efb2 1052 * If it does match the current key, get pos/bits from it, extract
19baf839
RO
1053 * the index from our key, push the T_TNODE and walk the tree.
1054 *
1055 * If it doesn't, we have to replace it with a new T_TNODE.
1056 *
c877efb2
SH
1057 * If we point to a T_LEAF, it might or might not have the same key
1058 * as we do. If it does, just change the value, update the T_LEAF's
1059 * value, and return it.
19baf839
RO
1060 * If it doesn't, we need to replace it with a T_TNODE.
1061 */
1062
1063 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1064 tn = (struct tnode *) n;
91b9a277 1065
c877efb2 1066 check_tnode(tn);
91b9a277 1067
c877efb2 1068 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
19baf839 1069 tp = tn;
91b9a277 1070 pos = tn->pos + tn->bits;
a07f5f50
SH
1071 n = tnode_get_child(tn,
1072 tkey_extract_bits(key,
1073 tn->pos,
1074 tn->bits));
19baf839 1075
06801916 1076 BUG_ON(n && node_parent(n) != tn);
91b9a277 1077 } else
19baf839
RO
1078 break;
1079 }
1080
1081 /*
1082 * n ----> NULL, LEAF or TNODE
1083 *
c877efb2 1084 * tp is n's (parent) ----> NULL or TNODE
19baf839
RO
1085 */
1086
91b9a277 1087 BUG_ON(tp && IS_LEAF(tp));
19baf839
RO
1088
1089 /* Case 1: n is a leaf. Compare prefixes */
1090
c877efb2 1091 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
c95aaf9a 1092 l = (struct leaf *) n;
19baf839 1093 li = leaf_info_new(plen);
91b9a277 1094
fea86ad8
SH
1095 if (!li)
1096 return NULL;
19baf839
RO
1097
1098 fa_head = &li->falh;
1099 insert_leaf_info(&l->list, li);
1100 goto done;
1101 }
19baf839
RO
1102 l = leaf_new();
1103
fea86ad8
SH
1104 if (!l)
1105 return NULL;
19baf839
RO
1106
1107 l->key = key;
1108 li = leaf_info_new(plen);
1109
c877efb2 1110 if (!li) {
387a5487 1111 free_leaf(l);
fea86ad8 1112 return NULL;
f835e471 1113 }
19baf839
RO
1114
1115 fa_head = &li->falh;
1116 insert_leaf_info(&l->list, li);
1117
19baf839 1118 if (t->trie && n == NULL) {
91b9a277 1119 /* Case 2: n is NULL, and will just insert a new leaf */
19baf839 1120
06801916 1121 node_set_parent((struct node *)l, tp);
19baf839 1122
91b9a277
OJ
1123 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1124 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1125 } else {
1126 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
c877efb2
SH
1127 /*
1128 * Add a new tnode here
19baf839
RO
1129 * first tnode need some special handling
1130 */
1131
1132 if (tp)
91b9a277 1133 pos = tp->pos+tp->bits;
19baf839 1134 else
91b9a277
OJ
1135 pos = 0;
1136
c877efb2 1137 if (n) {
19baf839
RO
1138 newpos = tkey_mismatch(key, pos, n->key);
1139 tn = tnode_new(n->key, newpos, 1);
91b9a277 1140 } else {
19baf839 1141 newpos = 0;
c877efb2 1142 tn = tnode_new(key, newpos, 1); /* First tnode */
19baf839 1143 }
19baf839 1144
c877efb2 1145 if (!tn) {
f835e471 1146 free_leaf_info(li);
387a5487 1147 free_leaf(l);
fea86ad8 1148 return NULL;
91b9a277
OJ
1149 }
1150
06801916 1151 node_set_parent((struct node *)tn, tp);
19baf839 1152
91b9a277 1153 missbit = tkey_extract_bits(key, newpos, 1);
19baf839
RO
1154 put_child(t, tn, missbit, (struct node *)l);
1155 put_child(t, tn, 1-missbit, n);
1156
c877efb2 1157 if (tp) {
19baf839 1158 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
a07f5f50
SH
1159 put_child(t, (struct tnode *)tp, cindex,
1160 (struct node *)tn);
91b9a277 1161 } else {
a07f5f50 1162 rcu_assign_pointer(t->trie, (struct node *)tn);
19baf839
RO
1163 tp = tn;
1164 }
1165 }
91b9a277
OJ
1166
1167 if (tp && tp->pos + tp->bits > 32)
a07f5f50
SH
1168 pr_warning("fib_trie"
1169 " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1170 tp, tp->pos, tp->bits, key, plen);
91b9a277 1171
19baf839 1172 /* Rebalance the trie */
2373ce1c 1173
7b85576d 1174 trie_rebalance(t, tp);
f835e471 1175done:
19baf839
RO
1176 return fa_head;
1177}
1178
d562f1f8
RO
1179/*
1180 * Caller must hold RTNL.
1181 */
16c6cf8b 1182int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1183{
1184 struct trie *t = (struct trie *) tb->tb_data;
1185 struct fib_alias *fa, *new_fa;
c877efb2 1186 struct list_head *fa_head = NULL;
19baf839 1187 struct fib_info *fi;
4e902c57
TG
1188 int plen = cfg->fc_dst_len;
1189 u8 tos = cfg->fc_tos;
19baf839
RO
1190 u32 key, mask;
1191 int err;
1192 struct leaf *l;
1193
1194 if (plen > 32)
1195 return -EINVAL;
1196
4e902c57 1197 key = ntohl(cfg->fc_dst);
19baf839 1198
2dfe55b4 1199 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
19baf839 1200
91b9a277 1201 mask = ntohl(inet_make_mask(plen));
19baf839 1202
c877efb2 1203 if (key & ~mask)
19baf839
RO
1204 return -EINVAL;
1205
1206 key = key & mask;
1207
4e902c57
TG
1208 fi = fib_create_info(cfg);
1209 if (IS_ERR(fi)) {
1210 err = PTR_ERR(fi);
19baf839 1211 goto err;
4e902c57 1212 }
19baf839
RO
1213
1214 l = fib_find_node(t, key);
c877efb2 1215 fa = NULL;
19baf839 1216
c877efb2 1217 if (l) {
19baf839
RO
1218 fa_head = get_fa_head(l, plen);
1219 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1220 }
1221
1222 /* Now fa, if non-NULL, points to the first fib alias
1223 * with the same keys [prefix,tos,priority], if such key already
1224 * exists or to the node before which we will insert new one.
1225 *
1226 * If fa is NULL, we will need to allocate a new one and
1227 * insert to the head of f.
1228 *
1229 * If f is NULL, no fib node matched the destination key
1230 * and we need to allocate a new one of those as well.
1231 */
1232
936f6f8e
JA
1233 if (fa && fa->fa_tos == tos &&
1234 fa->fa_info->fib_priority == fi->fib_priority) {
1235 struct fib_alias *fa_first, *fa_match;
19baf839
RO
1236
1237 err = -EEXIST;
4e902c57 1238 if (cfg->fc_nlflags & NLM_F_EXCL)
19baf839
RO
1239 goto out;
1240
936f6f8e
JA
1241 /* We have 2 goals:
1242 * 1. Find exact match for type, scope, fib_info to avoid
1243 * duplicate routes
1244 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1245 */
1246 fa_match = NULL;
1247 fa_first = fa;
1248 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1249 list_for_each_entry_continue(fa, fa_head, fa_list) {
1250 if (fa->fa_tos != tos)
1251 break;
1252 if (fa->fa_info->fib_priority != fi->fib_priority)
1253 break;
1254 if (fa->fa_type == cfg->fc_type &&
1255 fa->fa_scope == cfg->fc_scope &&
1256 fa->fa_info == fi) {
1257 fa_match = fa;
1258 break;
1259 }
1260 }
1261
4e902c57 1262 if (cfg->fc_nlflags & NLM_F_REPLACE) {
19baf839
RO
1263 struct fib_info *fi_drop;
1264 u8 state;
1265
936f6f8e
JA
1266 fa = fa_first;
1267 if (fa_match) {
1268 if (fa == fa_match)
1269 err = 0;
6725033f 1270 goto out;
936f6f8e 1271 }
2373ce1c 1272 err = -ENOBUFS;
e94b1766 1273 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
2373ce1c
RO
1274 if (new_fa == NULL)
1275 goto out;
19baf839
RO
1276
1277 fi_drop = fa->fa_info;
2373ce1c
RO
1278 new_fa->fa_tos = fa->fa_tos;
1279 new_fa->fa_info = fi;
4e902c57
TG
1280 new_fa->fa_type = cfg->fc_type;
1281 new_fa->fa_scope = cfg->fc_scope;
19baf839 1282 state = fa->fa_state;
936f6f8e 1283 new_fa->fa_state = state & ~FA_S_ACCESSED;
19baf839 1284
2373ce1c
RO
1285 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1286 alias_free_mem_rcu(fa);
19baf839
RO
1287
1288 fib_release_info(fi_drop);
1289 if (state & FA_S_ACCESSED)
76e6ebfb 1290 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
b8f55831
MK
1291 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1292 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
19baf839 1293
91b9a277 1294 goto succeeded;
19baf839
RO
1295 }
1296 /* Error if we find a perfect match which
1297 * uses the same scope, type, and nexthop
1298 * information.
1299 */
936f6f8e
JA
1300 if (fa_match)
1301 goto out;
a07f5f50 1302
4e902c57 1303 if (!(cfg->fc_nlflags & NLM_F_APPEND))
936f6f8e 1304 fa = fa_first;
19baf839
RO
1305 }
1306 err = -ENOENT;
4e902c57 1307 if (!(cfg->fc_nlflags & NLM_F_CREATE))
19baf839
RO
1308 goto out;
1309
1310 err = -ENOBUFS;
e94b1766 1311 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
19baf839
RO
1312 if (new_fa == NULL)
1313 goto out;
1314
1315 new_fa->fa_info = fi;
1316 new_fa->fa_tos = tos;
4e902c57
TG
1317 new_fa->fa_type = cfg->fc_type;
1318 new_fa->fa_scope = cfg->fc_scope;
19baf839 1319 new_fa->fa_state = 0;
19baf839
RO
1320 /*
1321 * Insert new entry to the list.
1322 */
1323
c877efb2 1324 if (!fa_head) {
fea86ad8
SH
1325 fa_head = fib_insert_node(t, key, plen);
1326 if (unlikely(!fa_head)) {
1327 err = -ENOMEM;
f835e471 1328 goto out_free_new_fa;
fea86ad8 1329 }
f835e471 1330 }
19baf839 1331
2373ce1c
RO
1332 list_add_tail_rcu(&new_fa->fa_list,
1333 (fa ? &fa->fa_list : fa_head));
19baf839 1334
76e6ebfb 1335 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
4e902c57 1336 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
b8f55831 1337 &cfg->fc_nlinfo, 0);
19baf839
RO
1338succeeded:
1339 return 0;
f835e471
RO
1340
1341out_free_new_fa:
1342 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1343out:
1344 fib_release_info(fi);
91b9a277 1345err:
19baf839
RO
1346 return err;
1347}
1348
772cb712 1349/* should be called with rcu_read_lock */
a07f5f50
SH
1350static int check_leaf(struct trie *t, struct leaf *l,
1351 t_key key, const struct flowi *flp,
1352 struct fib_result *res)
19baf839 1353{
19baf839
RO
1354 struct leaf_info *li;
1355 struct hlist_head *hhead = &l->list;
1356 struct hlist_node *node;
c877efb2 1357
2373ce1c 1358 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
a07f5f50
SH
1359 int err;
1360 int plen = li->plen;
1361 __be32 mask = inet_make_mask(plen);
1362
888454c5 1363 if (l->key != (key & ntohl(mask)))
19baf839
RO
1364 continue;
1365
e204a345 1366 err = fib_semantic_match(&li->falh, flp, res, plen);
a07f5f50 1367
19baf839 1368#ifdef CONFIG_IP_FIB_TRIE_STATS
a07f5f50 1369 if (err <= 0)
19baf839 1370 t->stats.semantic_match_passed++;
a07f5f50
SH
1371 else
1372 t->stats.semantic_match_miss++;
19baf839 1373#endif
a07f5f50 1374 if (err <= 0)
2e655571 1375 return err;
19baf839 1376 }
a07f5f50 1377
2e655571 1378 return 1;
19baf839
RO
1379}
1380
16c6cf8b
SH
1381int fib_table_lookup(struct fib_table *tb, const struct flowi *flp,
1382 struct fib_result *res)
19baf839
RO
1383{
1384 struct trie *t = (struct trie *) tb->tb_data;
2e655571 1385 int ret;
19baf839
RO
1386 struct node *n;
1387 struct tnode *pn;
1388 int pos, bits;
91b9a277 1389 t_key key = ntohl(flp->fl4_dst);
19baf839
RO
1390 int chopped_off;
1391 t_key cindex = 0;
1392 int current_prefix_length = KEYLENGTH;
91b9a277
OJ
1393 struct tnode *cn;
1394 t_key node_prefix, key_prefix, pref_mismatch;
1395 int mp;
1396
2373ce1c 1397 rcu_read_lock();
91b9a277 1398
2373ce1c 1399 n = rcu_dereference(t->trie);
c877efb2 1400 if (!n)
19baf839
RO
1401 goto failed;
1402
1403#ifdef CONFIG_IP_FIB_TRIE_STATS
1404 t->stats.gets++;
1405#endif
1406
1407 /* Just a leaf? */
1408 if (IS_LEAF(n)) {
2e655571 1409 ret = check_leaf(t, (struct leaf *)n, key, flp, res);
a07f5f50 1410 goto found;
19baf839 1411 }
a07f5f50 1412
19baf839
RO
1413 pn = (struct tnode *) n;
1414 chopped_off = 0;
c877efb2 1415
91b9a277 1416 while (pn) {
19baf839
RO
1417 pos = pn->pos;
1418 bits = pn->bits;
1419
c877efb2 1420 if (!chopped_off)
ab66b4a7
SH
1421 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1422 pos, bits);
19baf839 1423
b902e573 1424 n = tnode_get_child_rcu(pn, cindex);
19baf839
RO
1425
1426 if (n == NULL) {
1427#ifdef CONFIG_IP_FIB_TRIE_STATS
1428 t->stats.null_node_hit++;
1429#endif
1430 goto backtrace;
1431 }
1432
91b9a277 1433 if (IS_LEAF(n)) {
2e655571
BH
1434 ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1435 if (ret > 0)
91b9a277 1436 goto backtrace;
a07f5f50 1437 goto found;
91b9a277
OJ
1438 }
1439
91b9a277 1440 cn = (struct tnode *)n;
19baf839 1441
91b9a277
OJ
1442 /*
1443 * It's a tnode, and we can do some extra checks here if we
1444 * like, to avoid descending into a dead-end branch.
1445 * This tnode is in the parent's child array at index
1446 * key[p_pos..p_pos+p_bits] but potentially with some bits
1447 * chopped off, so in reality the index may be just a
1448 * subprefix, padded with zero at the end.
1449 * We can also take a look at any skipped bits in this
1450 * tnode - everything up to p_pos is supposed to be ok,
1451 * and the non-chopped bits of the index (se previous
1452 * paragraph) are also guaranteed ok, but the rest is
1453 * considered unknown.
1454 *
1455 * The skipped bits are key[pos+bits..cn->pos].
1456 */
19baf839 1457
91b9a277
OJ
1458 /* If current_prefix_length < pos+bits, we are already doing
1459 * actual prefix matching, which means everything from
1460 * pos+(bits-chopped_off) onward must be zero along some
1461 * branch of this subtree - otherwise there is *no* valid
1462 * prefix present. Here we can only check the skipped
1463 * bits. Remember, since we have already indexed into the
1464 * parent's child array, we know that the bits we chopped of
1465 * *are* zero.
1466 */
19baf839 1467
a07f5f50
SH
1468 /* NOTA BENE: Checking only skipped bits
1469 for the new node here */
19baf839 1470
91b9a277
OJ
1471 if (current_prefix_length < pos+bits) {
1472 if (tkey_extract_bits(cn->key, current_prefix_length,
a07f5f50
SH
1473 cn->pos - current_prefix_length)
1474 || !(cn->child[0]))
91b9a277
OJ
1475 goto backtrace;
1476 }
19baf839 1477
91b9a277
OJ
1478 /*
1479 * If chopped_off=0, the index is fully validated and we
1480 * only need to look at the skipped bits for this, the new,
1481 * tnode. What we actually want to do is to find out if
1482 * these skipped bits match our key perfectly, or if we will
1483 * have to count on finding a matching prefix further down,
1484 * because if we do, we would like to have some way of
1485 * verifying the existence of such a prefix at this point.
1486 */
19baf839 1487
91b9a277
OJ
1488 /* The only thing we can do at this point is to verify that
1489 * any such matching prefix can indeed be a prefix to our
1490 * key, and if the bits in the node we are inspecting that
1491 * do not match our key are not ZERO, this cannot be true.
1492 * Thus, find out where there is a mismatch (before cn->pos)
1493 * and verify that all the mismatching bits are zero in the
1494 * new tnode's key.
1495 */
19baf839 1496
a07f5f50
SH
1497 /*
1498 * Note: We aren't very concerned about the piece of
1499 * the key that precede pn->pos+pn->bits, since these
1500 * have already been checked. The bits after cn->pos
1501 * aren't checked since these are by definition
1502 * "unknown" at this point. Thus, what we want to see
1503 * is if we are about to enter the "prefix matching"
1504 * state, and in that case verify that the skipped
1505 * bits that will prevail throughout this subtree are
1506 * zero, as they have to be if we are to find a
1507 * matching prefix.
91b9a277
OJ
1508 */
1509
ab66b4a7
SH
1510 node_prefix = mask_pfx(cn->key, cn->pos);
1511 key_prefix = mask_pfx(key, cn->pos);
91b9a277
OJ
1512 pref_mismatch = key_prefix^node_prefix;
1513 mp = 0;
1514
a07f5f50
SH
1515 /*
1516 * In short: If skipped bits in this node do not match
1517 * the search key, enter the "prefix matching"
1518 * state.directly.
91b9a277
OJ
1519 */
1520 if (pref_mismatch) {
1521 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1522 mp++;
a07f5f50 1523 pref_mismatch = pref_mismatch << 1;
91b9a277
OJ
1524 }
1525 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1526
1527 if (key_prefix != 0)
1528 goto backtrace;
1529
1530 if (current_prefix_length >= cn->pos)
1531 current_prefix_length = mp;
c877efb2 1532 }
a07f5f50 1533
91b9a277
OJ
1534 pn = (struct tnode *)n; /* Descend */
1535 chopped_off = 0;
1536 continue;
1537
19baf839
RO
1538backtrace:
1539 chopped_off++;
1540
1541 /* As zero don't change the child key (cindex) */
a07f5f50
SH
1542 while ((chopped_off <= pn->bits)
1543 && !(cindex & (1<<(chopped_off-1))))
19baf839 1544 chopped_off++;
19baf839
RO
1545
1546 /* Decrease current_... with bits chopped off */
1547 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
a07f5f50
SH
1548 current_prefix_length = pn->pos + pn->bits
1549 - chopped_off;
91b9a277 1550
19baf839 1551 /*
c877efb2 1552 * Either we do the actual chop off according or if we have
19baf839
RO
1553 * chopped off all bits in this tnode walk up to our parent.
1554 */
1555
91b9a277 1556 if (chopped_off <= pn->bits) {
19baf839 1557 cindex &= ~(1 << (chopped_off-1));
91b9a277 1558 } else {
b902e573 1559 struct tnode *parent = node_parent_rcu((struct node *) pn);
06801916 1560 if (!parent)
19baf839 1561 goto failed;
91b9a277 1562
19baf839 1563 /* Get Child's index */
06801916
SH
1564 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1565 pn = parent;
19baf839
RO
1566 chopped_off = 0;
1567
1568#ifdef CONFIG_IP_FIB_TRIE_STATS
1569 t->stats.backtrack++;
1570#endif
1571 goto backtrace;
c877efb2 1572 }
19baf839
RO
1573 }
1574failed:
c877efb2 1575 ret = 1;
19baf839 1576found:
2373ce1c 1577 rcu_read_unlock();
19baf839
RO
1578 return ret;
1579}
1580
9195bef7
SH
1581/*
1582 * Remove the leaf and return parent.
1583 */
1584static void trie_leaf_remove(struct trie *t, struct leaf *l)
19baf839 1585{
9195bef7 1586 struct tnode *tp = node_parent((struct node *) l);
c877efb2 1587
9195bef7 1588 pr_debug("entering trie_leaf_remove(%p)\n", l);
19baf839 1589
c877efb2 1590 if (tp) {
9195bef7 1591 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
19baf839 1592 put_child(t, (struct tnode *)tp, cindex, NULL);
7b85576d 1593 trie_rebalance(t, tp);
91b9a277 1594 } else
2373ce1c 1595 rcu_assign_pointer(t->trie, NULL);
19baf839 1596
387a5487 1597 free_leaf(l);
19baf839
RO
1598}
1599
d562f1f8
RO
1600/*
1601 * Caller must hold RTNL.
1602 */
16c6cf8b 1603int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1604{
1605 struct trie *t = (struct trie *) tb->tb_data;
1606 u32 key, mask;
4e902c57
TG
1607 int plen = cfg->fc_dst_len;
1608 u8 tos = cfg->fc_tos;
19baf839
RO
1609 struct fib_alias *fa, *fa_to_delete;
1610 struct list_head *fa_head;
1611 struct leaf *l;
91b9a277
OJ
1612 struct leaf_info *li;
1613
c877efb2 1614 if (plen > 32)
19baf839
RO
1615 return -EINVAL;
1616
4e902c57 1617 key = ntohl(cfg->fc_dst);
91b9a277 1618 mask = ntohl(inet_make_mask(plen));
19baf839 1619
c877efb2 1620 if (key & ~mask)
19baf839
RO
1621 return -EINVAL;
1622
1623 key = key & mask;
1624 l = fib_find_node(t, key);
1625
c877efb2 1626 if (!l)
19baf839
RO
1627 return -ESRCH;
1628
1629 fa_head = get_fa_head(l, plen);
1630 fa = fib_find_alias(fa_head, tos, 0);
1631
1632 if (!fa)
1633 return -ESRCH;
1634
0c7770c7 1635 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
19baf839
RO
1636
1637 fa_to_delete = NULL;
936f6f8e
JA
1638 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1639 list_for_each_entry_continue(fa, fa_head, fa_list) {
19baf839
RO
1640 struct fib_info *fi = fa->fa_info;
1641
1642 if (fa->fa_tos != tos)
1643 break;
1644
4e902c57
TG
1645 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1646 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1647 fa->fa_scope == cfg->fc_scope) &&
1648 (!cfg->fc_protocol ||
1649 fi->fib_protocol == cfg->fc_protocol) &&
1650 fib_nh_match(cfg, fi) == 0) {
19baf839
RO
1651 fa_to_delete = fa;
1652 break;
1653 }
1654 }
1655
91b9a277
OJ
1656 if (!fa_to_delete)
1657 return -ESRCH;
19baf839 1658
91b9a277 1659 fa = fa_to_delete;
4e902c57 1660 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
b8f55831 1661 &cfg->fc_nlinfo, 0);
91b9a277
OJ
1662
1663 l = fib_find_node(t, key);
772cb712 1664 li = find_leaf_info(l, plen);
19baf839 1665
2373ce1c 1666 list_del_rcu(&fa->fa_list);
19baf839 1667
91b9a277 1668 if (list_empty(fa_head)) {
2373ce1c 1669 hlist_del_rcu(&li->hlist);
91b9a277 1670 free_leaf_info(li);
2373ce1c 1671 }
19baf839 1672
91b9a277 1673 if (hlist_empty(&l->list))
9195bef7 1674 trie_leaf_remove(t, l);
19baf839 1675
91b9a277 1676 if (fa->fa_state & FA_S_ACCESSED)
76e6ebfb 1677 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
19baf839 1678
2373ce1c
RO
1679 fib_release_info(fa->fa_info);
1680 alias_free_mem_rcu(fa);
91b9a277 1681 return 0;
19baf839
RO
1682}
1683
ef3660ce 1684static int trie_flush_list(struct list_head *head)
19baf839
RO
1685{
1686 struct fib_alias *fa, *fa_node;
1687 int found = 0;
1688
1689 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1690 struct fib_info *fi = fa->fa_info;
19baf839 1691
2373ce1c
RO
1692 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1693 list_del_rcu(&fa->fa_list);
1694 fib_release_info(fa->fa_info);
1695 alias_free_mem_rcu(fa);
19baf839
RO
1696 found++;
1697 }
1698 }
1699 return found;
1700}
1701
ef3660ce 1702static int trie_flush_leaf(struct leaf *l)
19baf839
RO
1703{
1704 int found = 0;
1705 struct hlist_head *lih = &l->list;
1706 struct hlist_node *node, *tmp;
1707 struct leaf_info *li = NULL;
1708
1709 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
ef3660ce 1710 found += trie_flush_list(&li->falh);
19baf839
RO
1711
1712 if (list_empty(&li->falh)) {
2373ce1c 1713 hlist_del_rcu(&li->hlist);
19baf839
RO
1714 free_leaf_info(li);
1715 }
1716 }
1717 return found;
1718}
1719
82cfbb00
SH
1720/*
1721 * Scan for the next right leaf starting at node p->child[idx]
1722 * Since we have back pointer, no recursion necessary.
1723 */
1724static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
19baf839 1725{
82cfbb00
SH
1726 do {
1727 t_key idx;
c877efb2 1728
c877efb2 1729 if (c)
82cfbb00 1730 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
c877efb2 1731 else
82cfbb00 1732 idx = 0;
2373ce1c 1733
82cfbb00
SH
1734 while (idx < 1u << p->bits) {
1735 c = tnode_get_child_rcu(p, idx++);
2373ce1c 1736 if (!c)
91b9a277
OJ
1737 continue;
1738
82cfbb00
SH
1739 if (IS_LEAF(c)) {
1740 prefetch(p->child[idx]);
1741 return (struct leaf *) c;
19baf839 1742 }
82cfbb00
SH
1743
1744 /* Rescan start scanning in new node */
1745 p = (struct tnode *) c;
1746 idx = 0;
19baf839 1747 }
82cfbb00
SH
1748
1749 /* Node empty, walk back up to parent */
91b9a277 1750 c = (struct node *) p;
82cfbb00
SH
1751 } while ( (p = node_parent_rcu(c)) != NULL);
1752
1753 return NULL; /* Root of trie */
1754}
1755
82cfbb00
SH
1756static struct leaf *trie_firstleaf(struct trie *t)
1757{
f6b085b6
JP
1758 struct tnode *n = (struct tnode *) rcu_dereference_check(t->trie,
1759 rcu_read_lock_held() ||
1760 lockdep_rtnl_is_held());
82cfbb00
SH
1761
1762 if (!n)
1763 return NULL;
1764
1765 if (IS_LEAF(n)) /* trie is just a leaf */
1766 return (struct leaf *) n;
1767
1768 return leaf_walk_rcu(n, NULL);
1769}
1770
1771static struct leaf *trie_nextleaf(struct leaf *l)
1772{
1773 struct node *c = (struct node *) l;
b902e573 1774 struct tnode *p = node_parent_rcu(c);
82cfbb00
SH
1775
1776 if (!p)
1777 return NULL; /* trie with just one leaf */
1778
1779 return leaf_walk_rcu(p, c);
19baf839
RO
1780}
1781
71d67e66
SH
1782static struct leaf *trie_leafindex(struct trie *t, int index)
1783{
1784 struct leaf *l = trie_firstleaf(t);
1785
ec28cf73 1786 while (l && index-- > 0)
71d67e66 1787 l = trie_nextleaf(l);
ec28cf73 1788
71d67e66
SH
1789 return l;
1790}
1791
1792
d562f1f8
RO
1793/*
1794 * Caller must hold RTNL.
1795 */
16c6cf8b 1796int fib_table_flush(struct fib_table *tb)
19baf839
RO
1797{
1798 struct trie *t = (struct trie *) tb->tb_data;
9195bef7 1799 struct leaf *l, *ll = NULL;
82cfbb00 1800 int found = 0;
19baf839 1801
82cfbb00 1802 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
ef3660ce 1803 found += trie_flush_leaf(l);
19baf839
RO
1804
1805 if (ll && hlist_empty(&ll->list))
9195bef7 1806 trie_leaf_remove(t, ll);
19baf839
RO
1807 ll = l;
1808 }
1809
1810 if (ll && hlist_empty(&ll->list))
9195bef7 1811 trie_leaf_remove(t, ll);
19baf839 1812
0c7770c7 1813 pr_debug("trie_flush found=%d\n", found);
19baf839
RO
1814 return found;
1815}
1816
16c6cf8b
SH
1817void fib_table_select_default(struct fib_table *tb,
1818 const struct flowi *flp,
1819 struct fib_result *res)
19baf839
RO
1820{
1821 struct trie *t = (struct trie *) tb->tb_data;
1822 int order, last_idx;
1823 struct fib_info *fi = NULL;
1824 struct fib_info *last_resort;
1825 struct fib_alias *fa = NULL;
1826 struct list_head *fa_head;
1827 struct leaf *l;
1828
1829 last_idx = -1;
1830 last_resort = NULL;
1831 order = -1;
1832
2373ce1c 1833 rcu_read_lock();
c877efb2 1834
19baf839 1835 l = fib_find_node(t, 0);
c877efb2 1836 if (!l)
19baf839
RO
1837 goto out;
1838
1839 fa_head = get_fa_head(l, 0);
c877efb2 1840 if (!fa_head)
19baf839
RO
1841 goto out;
1842
c877efb2 1843 if (list_empty(fa_head))
19baf839
RO
1844 goto out;
1845
2373ce1c 1846 list_for_each_entry_rcu(fa, fa_head, fa_list) {
19baf839 1847 struct fib_info *next_fi = fa->fa_info;
91b9a277 1848
19baf839
RO
1849 if (fa->fa_scope != res->scope ||
1850 fa->fa_type != RTN_UNICAST)
1851 continue;
91b9a277 1852
19baf839
RO
1853 if (next_fi->fib_priority > res->fi->fib_priority)
1854 break;
1855 if (!next_fi->fib_nh[0].nh_gw ||
1856 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1857 continue;
1858 fa->fa_state |= FA_S_ACCESSED;
91b9a277 1859
19baf839
RO
1860 if (fi == NULL) {
1861 if (next_fi != res->fi)
1862 break;
1863 } else if (!fib_detect_death(fi, order, &last_resort,
971b893e 1864 &last_idx, tb->tb_default)) {
a2bbe682 1865 fib_result_assign(res, fi);
971b893e 1866 tb->tb_default = order;
19baf839
RO
1867 goto out;
1868 }
1869 fi = next_fi;
1870 order++;
1871 }
1872 if (order <= 0 || fi == NULL) {
971b893e 1873 tb->tb_default = -1;
19baf839
RO
1874 goto out;
1875 }
1876
971b893e
DL
1877 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1878 tb->tb_default)) {
a2bbe682 1879 fib_result_assign(res, fi);
971b893e 1880 tb->tb_default = order;
19baf839
RO
1881 goto out;
1882 }
a2bbe682
DL
1883 if (last_idx >= 0)
1884 fib_result_assign(res, last_resort);
971b893e
DL
1885 tb->tb_default = last_idx;
1886out:
2373ce1c 1887 rcu_read_unlock();
19baf839
RO
1888}
1889
a07f5f50
SH
1890static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1891 struct fib_table *tb,
19baf839
RO
1892 struct sk_buff *skb, struct netlink_callback *cb)
1893{
1894 int i, s_i;
1895 struct fib_alias *fa;
32ab5f80 1896 __be32 xkey = htonl(key);
19baf839 1897
71d67e66 1898 s_i = cb->args[5];
19baf839
RO
1899 i = 0;
1900
2373ce1c
RO
1901 /* rcu_read_lock is hold by caller */
1902
1903 list_for_each_entry_rcu(fa, fah, fa_list) {
19baf839
RO
1904 if (i < s_i) {
1905 i++;
1906 continue;
1907 }
19baf839
RO
1908
1909 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1910 cb->nlh->nlmsg_seq,
1911 RTM_NEWROUTE,
1912 tb->tb_id,
1913 fa->fa_type,
1914 fa->fa_scope,
be403ea1 1915 xkey,
19baf839
RO
1916 plen,
1917 fa->fa_tos,
64347f78 1918 fa->fa_info, NLM_F_MULTI) < 0) {
71d67e66 1919 cb->args[5] = i;
19baf839 1920 return -1;
91b9a277 1921 }
19baf839
RO
1922 i++;
1923 }
71d67e66 1924 cb->args[5] = i;
19baf839
RO
1925 return skb->len;
1926}
1927
a88ee229
SH
1928static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1929 struct sk_buff *skb, struct netlink_callback *cb)
19baf839 1930{
a88ee229
SH
1931 struct leaf_info *li;
1932 struct hlist_node *node;
1933 int i, s_i;
19baf839 1934
71d67e66 1935 s_i = cb->args[4];
a88ee229 1936 i = 0;
19baf839 1937
a88ee229
SH
1938 /* rcu_read_lock is hold by caller */
1939 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1940 if (i < s_i) {
1941 i++;
19baf839 1942 continue;
a88ee229 1943 }
91b9a277 1944
a88ee229 1945 if (i > s_i)
71d67e66 1946 cb->args[5] = 0;
19baf839 1947
a88ee229 1948 if (list_empty(&li->falh))
19baf839
RO
1949 continue;
1950
a88ee229 1951 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
71d67e66 1952 cb->args[4] = i;
19baf839
RO
1953 return -1;
1954 }
a88ee229 1955 i++;
19baf839 1956 }
a88ee229 1957
71d67e66 1958 cb->args[4] = i;
19baf839
RO
1959 return skb->len;
1960}
1961
16c6cf8b
SH
1962int fib_table_dump(struct fib_table *tb, struct sk_buff *skb,
1963 struct netlink_callback *cb)
19baf839 1964{
a88ee229 1965 struct leaf *l;
19baf839 1966 struct trie *t = (struct trie *) tb->tb_data;
d5ce8a0e 1967 t_key key = cb->args[2];
71d67e66 1968 int count = cb->args[3];
19baf839 1969
2373ce1c 1970 rcu_read_lock();
d5ce8a0e
SH
1971 /* Dump starting at last key.
1972 * Note: 0.0.0.0/0 (ie default) is first key.
1973 */
71d67e66 1974 if (count == 0)
d5ce8a0e
SH
1975 l = trie_firstleaf(t);
1976 else {
71d67e66
SH
1977 /* Normally, continue from last key, but if that is missing
1978 * fallback to using slow rescan
1979 */
d5ce8a0e 1980 l = fib_find_node(t, key);
71d67e66
SH
1981 if (!l)
1982 l = trie_leafindex(t, count);
d5ce8a0e 1983 }
a88ee229 1984
d5ce8a0e
SH
1985 while (l) {
1986 cb->args[2] = l->key;
a88ee229 1987 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
71d67e66 1988 cb->args[3] = count;
a88ee229 1989 rcu_read_unlock();
a88ee229 1990 return -1;
19baf839 1991 }
d5ce8a0e 1992
71d67e66 1993 ++count;
d5ce8a0e 1994 l = trie_nextleaf(l);
71d67e66
SH
1995 memset(&cb->args[4], 0,
1996 sizeof(cb->args) - 4*sizeof(cb->args[0]));
19baf839 1997 }
71d67e66 1998 cb->args[3] = count;
2373ce1c 1999 rcu_read_unlock();
a88ee229 2000
19baf839 2001 return skb->len;
19baf839
RO
2002}
2003
7f9b8052
SH
2004void __init fib_hash_init(void)
2005{
a07f5f50
SH
2006 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2007 sizeof(struct fib_alias),
bc3c8c1e
SH
2008 0, SLAB_PANIC, NULL);
2009
2010 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2011 max(sizeof(struct leaf),
2012 sizeof(struct leaf_info)),
2013 0, SLAB_PANIC, NULL);
7f9b8052 2014}
19baf839 2015
7f9b8052
SH
2016
2017/* Fix more generic FIB names for init later */
2018struct fib_table *fib_hash_table(u32 id)
19baf839
RO
2019{
2020 struct fib_table *tb;
2021 struct trie *t;
2022
19baf839
RO
2023 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2024 GFP_KERNEL);
2025 if (tb == NULL)
2026 return NULL;
2027
2028 tb->tb_id = id;
971b893e 2029 tb->tb_default = -1;
19baf839
RO
2030
2031 t = (struct trie *) tb->tb_data;
c28a1cf4 2032 memset(t, 0, sizeof(*t));
19baf839 2033
19baf839 2034 if (id == RT_TABLE_LOCAL)
a07f5f50 2035 pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
19baf839
RO
2036
2037 return tb;
2038}
2039
cb7b593c
SH
2040#ifdef CONFIG_PROC_FS
2041/* Depth first Trie walk iterator */
2042struct fib_trie_iter {
1c340b2f 2043 struct seq_net_private p;
3d3b2d25 2044 struct fib_table *tb;
cb7b593c 2045 struct tnode *tnode;
cb7b593c
SH
2046 unsigned index;
2047 unsigned depth;
2048};
19baf839 2049
cb7b593c 2050static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
19baf839 2051{
cb7b593c
SH
2052 struct tnode *tn = iter->tnode;
2053 unsigned cindex = iter->index;
2054 struct tnode *p;
19baf839 2055
6640e697
EB
2056 /* A single entry routing table */
2057 if (!tn)
2058 return NULL;
2059
cb7b593c
SH
2060 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2061 iter->tnode, iter->index, iter->depth);
2062rescan:
2063 while (cindex < (1<<tn->bits)) {
b59cfbf7 2064 struct node *n = tnode_get_child_rcu(tn, cindex);
19baf839 2065
cb7b593c
SH
2066 if (n) {
2067 if (IS_LEAF(n)) {
2068 iter->tnode = tn;
2069 iter->index = cindex + 1;
2070 } else {
2071 /* push down one level */
2072 iter->tnode = (struct tnode *) n;
2073 iter->index = 0;
2074 ++iter->depth;
2075 }
2076 return n;
2077 }
19baf839 2078
cb7b593c
SH
2079 ++cindex;
2080 }
91b9a277 2081
cb7b593c 2082 /* Current node exhausted, pop back up */
b59cfbf7 2083 p = node_parent_rcu((struct node *)tn);
cb7b593c
SH
2084 if (p) {
2085 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2086 tn = p;
2087 --iter->depth;
2088 goto rescan;
19baf839 2089 }
cb7b593c
SH
2090
2091 /* got root? */
2092 return NULL;
19baf839
RO
2093}
2094
cb7b593c
SH
2095static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2096 struct trie *t)
19baf839 2097{
3d3b2d25 2098 struct node *n;
5ddf0eb2 2099
132adf54 2100 if (!t)
5ddf0eb2
RO
2101 return NULL;
2102
2103 n = rcu_dereference(t->trie);
3d3b2d25 2104 if (!n)
5ddf0eb2 2105 return NULL;
19baf839 2106
3d3b2d25
SH
2107 if (IS_TNODE(n)) {
2108 iter->tnode = (struct tnode *) n;
2109 iter->index = 0;
2110 iter->depth = 1;
2111 } else {
2112 iter->tnode = NULL;
2113 iter->index = 0;
2114 iter->depth = 0;
91b9a277 2115 }
3d3b2d25
SH
2116
2117 return n;
cb7b593c 2118}
91b9a277 2119
cb7b593c
SH
2120static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2121{
2122 struct node *n;
2123 struct fib_trie_iter iter;
91b9a277 2124
cb7b593c 2125 memset(s, 0, sizeof(*s));
91b9a277 2126
cb7b593c 2127 rcu_read_lock();
3d3b2d25 2128 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
cb7b593c 2129 if (IS_LEAF(n)) {
93672292
SH
2130 struct leaf *l = (struct leaf *)n;
2131 struct leaf_info *li;
2132 struct hlist_node *tmp;
2133
cb7b593c
SH
2134 s->leaves++;
2135 s->totdepth += iter.depth;
2136 if (iter.depth > s->maxdepth)
2137 s->maxdepth = iter.depth;
93672292
SH
2138
2139 hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2140 ++s->prefixes;
cb7b593c
SH
2141 } else {
2142 const struct tnode *tn = (const struct tnode *) n;
2143 int i;
2144
2145 s->tnodes++;
132adf54 2146 if (tn->bits < MAX_STAT_DEPTH)
06ef921d
RO
2147 s->nodesizes[tn->bits]++;
2148
cb7b593c
SH
2149 for (i = 0; i < (1<<tn->bits); i++)
2150 if (!tn->child[i])
2151 s->nullpointers++;
19baf839 2152 }
19baf839 2153 }
2373ce1c 2154 rcu_read_unlock();
19baf839
RO
2155}
2156
cb7b593c
SH
2157/*
2158 * This outputs /proc/net/fib_triestats
2159 */
2160static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
19baf839 2161{
cb7b593c 2162 unsigned i, max, pointers, bytes, avdepth;
c877efb2 2163
cb7b593c
SH
2164 if (stat->leaves)
2165 avdepth = stat->totdepth*100 / stat->leaves;
2166 else
2167 avdepth = 0;
91b9a277 2168
a07f5f50
SH
2169 seq_printf(seq, "\tAver depth: %u.%02d\n",
2170 avdepth / 100, avdepth % 100);
cb7b593c 2171 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
91b9a277 2172
cb7b593c 2173 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
cb7b593c 2174 bytes = sizeof(struct leaf) * stat->leaves;
93672292
SH
2175
2176 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2177 bytes += sizeof(struct leaf_info) * stat->prefixes;
2178
187b5188 2179 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
cb7b593c 2180 bytes += sizeof(struct tnode) * stat->tnodes;
19baf839 2181
06ef921d
RO
2182 max = MAX_STAT_DEPTH;
2183 while (max > 0 && stat->nodesizes[max-1] == 0)
cb7b593c 2184 max--;
19baf839 2185
cb7b593c
SH
2186 pointers = 0;
2187 for (i = 1; i <= max; i++)
2188 if (stat->nodesizes[i] != 0) {
187b5188 2189 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
cb7b593c
SH
2190 pointers += (1<<i) * stat->nodesizes[i];
2191 }
2192 seq_putc(seq, '\n');
187b5188 2193 seq_printf(seq, "\tPointers: %u\n", pointers);
2373ce1c 2194
cb7b593c 2195 bytes += sizeof(struct node *) * pointers;
187b5188
SH
2196 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2197 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
66a2f7fd 2198}
2373ce1c 2199
cb7b593c 2200#ifdef CONFIG_IP_FIB_TRIE_STATS
66a2f7fd
SH
2201static void trie_show_usage(struct seq_file *seq,
2202 const struct trie_use_stats *stats)
2203{
2204 seq_printf(seq, "\nCounters:\n---------\n");
a07f5f50
SH
2205 seq_printf(seq, "gets = %u\n", stats->gets);
2206 seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2207 seq_printf(seq, "semantic match passed = %u\n",
2208 stats->semantic_match_passed);
2209 seq_printf(seq, "semantic match miss = %u\n",
2210 stats->semantic_match_miss);
2211 seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2212 seq_printf(seq, "skipped node resize = %u\n\n",
2213 stats->resize_node_skipped);
cb7b593c 2214}
66a2f7fd
SH
2215#endif /* CONFIG_IP_FIB_TRIE_STATS */
2216
3d3b2d25 2217static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
d717a9a6 2218{
3d3b2d25
SH
2219 if (tb->tb_id == RT_TABLE_LOCAL)
2220 seq_puts(seq, "Local:\n");
2221 else if (tb->tb_id == RT_TABLE_MAIN)
2222 seq_puts(seq, "Main:\n");
2223 else
2224 seq_printf(seq, "Id %d:\n", tb->tb_id);
d717a9a6 2225}
19baf839 2226
3d3b2d25 2227
cb7b593c
SH
2228static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2229{
1c340b2f 2230 struct net *net = (struct net *)seq->private;
3d3b2d25 2231 unsigned int h;
877a9bff 2232
d717a9a6 2233 seq_printf(seq,
a07f5f50
SH
2234 "Basic info: size of leaf:"
2235 " %Zd bytes, size of tnode: %Zd bytes.\n",
d717a9a6
SH
2236 sizeof(struct leaf), sizeof(struct tnode));
2237
3d3b2d25
SH
2238 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2239 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2240 struct hlist_node *node;
2241 struct fib_table *tb;
2242
2243 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2244 struct trie *t = (struct trie *) tb->tb_data;
2245 struct trie_stat stat;
877a9bff 2246
3d3b2d25
SH
2247 if (!t)
2248 continue;
2249
2250 fib_table_print(seq, tb);
2251
2252 trie_collect_stats(t, &stat);
2253 trie_show_stats(seq, &stat);
2254#ifdef CONFIG_IP_FIB_TRIE_STATS
2255 trie_show_usage(seq, &t->stats);
2256#endif
2257 }
2258 }
19baf839 2259
cb7b593c 2260 return 0;
19baf839
RO
2261}
2262
cb7b593c 2263static int fib_triestat_seq_open(struct inode *inode, struct file *file)
19baf839 2264{
de05c557 2265 return single_open_net(inode, file, fib_triestat_seq_show);
1c340b2f
DL
2266}
2267
9a32144e 2268static const struct file_operations fib_triestat_fops = {
cb7b593c
SH
2269 .owner = THIS_MODULE,
2270 .open = fib_triestat_seq_open,
2271 .read = seq_read,
2272 .llseek = seq_lseek,
b6fcbdb4 2273 .release = single_release_net,
cb7b593c
SH
2274};
2275
1218854a 2276static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
19baf839 2277{
1218854a
YH
2278 struct fib_trie_iter *iter = seq->private;
2279 struct net *net = seq_file_net(seq);
cb7b593c 2280 loff_t idx = 0;
3d3b2d25 2281 unsigned int h;
cb7b593c 2282
3d3b2d25
SH
2283 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2284 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2285 struct hlist_node *node;
2286 struct fib_table *tb;
cb7b593c 2287
3d3b2d25
SH
2288 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2289 struct node *n;
2290
2291 for (n = fib_trie_get_first(iter,
2292 (struct trie *) tb->tb_data);
2293 n; n = fib_trie_get_next(iter))
2294 if (pos == idx++) {
2295 iter->tb = tb;
2296 return n;
2297 }
2298 }
cb7b593c 2299 }
3d3b2d25 2300
19baf839
RO
2301 return NULL;
2302}
2303
cb7b593c 2304static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
c95aaf9a 2305 __acquires(RCU)
19baf839 2306{
cb7b593c 2307 rcu_read_lock();
1218854a 2308 return fib_trie_get_idx(seq, *pos);
19baf839
RO
2309}
2310
cb7b593c 2311static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
19baf839 2312{
cb7b593c 2313 struct fib_trie_iter *iter = seq->private;
1218854a 2314 struct net *net = seq_file_net(seq);
3d3b2d25
SH
2315 struct fib_table *tb = iter->tb;
2316 struct hlist_node *tb_node;
2317 unsigned int h;
2318 struct node *n;
cb7b593c 2319
19baf839 2320 ++*pos;
3d3b2d25
SH
2321 /* next node in same table */
2322 n = fib_trie_get_next(iter);
2323 if (n)
2324 return n;
19baf839 2325
3d3b2d25
SH
2326 /* walk rest of this hash chain */
2327 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2328 while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2329 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2330 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2331 if (n)
2332 goto found;
2333 }
19baf839 2334
3d3b2d25
SH
2335 /* new hash chain */
2336 while (++h < FIB_TABLE_HASHSZ) {
2337 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2338 hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2339 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2340 if (n)
2341 goto found;
2342 }
2343 }
cb7b593c 2344 return NULL;
3d3b2d25
SH
2345
2346found:
2347 iter->tb = tb;
2348 return n;
cb7b593c 2349}
19baf839 2350
cb7b593c 2351static void fib_trie_seq_stop(struct seq_file *seq, void *v)
c95aaf9a 2352 __releases(RCU)
19baf839 2353{
cb7b593c
SH
2354 rcu_read_unlock();
2355}
91b9a277 2356
cb7b593c
SH
2357static void seq_indent(struct seq_file *seq, int n)
2358{
2359 while (n-- > 0) seq_puts(seq, " ");
2360}
19baf839 2361
28d36e37 2362static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
cb7b593c 2363{
132adf54 2364 switch (s) {
cb7b593c
SH
2365 case RT_SCOPE_UNIVERSE: return "universe";
2366 case RT_SCOPE_SITE: return "site";
2367 case RT_SCOPE_LINK: return "link";
2368 case RT_SCOPE_HOST: return "host";
2369 case RT_SCOPE_NOWHERE: return "nowhere";
2370 default:
28d36e37 2371 snprintf(buf, len, "scope=%d", s);
cb7b593c
SH
2372 return buf;
2373 }
2374}
19baf839 2375
36cbd3dc 2376static const char *const rtn_type_names[__RTN_MAX] = {
cb7b593c
SH
2377 [RTN_UNSPEC] = "UNSPEC",
2378 [RTN_UNICAST] = "UNICAST",
2379 [RTN_LOCAL] = "LOCAL",
2380 [RTN_BROADCAST] = "BROADCAST",
2381 [RTN_ANYCAST] = "ANYCAST",
2382 [RTN_MULTICAST] = "MULTICAST",
2383 [RTN_BLACKHOLE] = "BLACKHOLE",
2384 [RTN_UNREACHABLE] = "UNREACHABLE",
2385 [RTN_PROHIBIT] = "PROHIBIT",
2386 [RTN_THROW] = "THROW",
2387 [RTN_NAT] = "NAT",
2388 [RTN_XRESOLVE] = "XRESOLVE",
2389};
19baf839 2390
28d36e37 2391static inline const char *rtn_type(char *buf, size_t len, unsigned t)
cb7b593c 2392{
cb7b593c
SH
2393 if (t < __RTN_MAX && rtn_type_names[t])
2394 return rtn_type_names[t];
28d36e37 2395 snprintf(buf, len, "type %u", t);
cb7b593c 2396 return buf;
19baf839
RO
2397}
2398
cb7b593c
SH
2399/* Pretty print the trie */
2400static int fib_trie_seq_show(struct seq_file *seq, void *v)
19baf839 2401{
cb7b593c
SH
2402 const struct fib_trie_iter *iter = seq->private;
2403 struct node *n = v;
c877efb2 2404
3d3b2d25
SH
2405 if (!node_parent_rcu(n))
2406 fib_table_print(seq, iter->tb);
095b8501 2407
cb7b593c
SH
2408 if (IS_TNODE(n)) {
2409 struct tnode *tn = (struct tnode *) n;
ab66b4a7 2410 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
91b9a277 2411
1d25cd6c 2412 seq_indent(seq, iter->depth-1);
673d57e7
HH
2413 seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
2414 &prf, tn->pos, tn->bits, tn->full_children,
1d25cd6c 2415 tn->empty_children);
e905a9ed 2416
cb7b593c
SH
2417 } else {
2418 struct leaf *l = (struct leaf *) n;
1328042e
SH
2419 struct leaf_info *li;
2420 struct hlist_node *node;
32ab5f80 2421 __be32 val = htonl(l->key);
cb7b593c
SH
2422
2423 seq_indent(seq, iter->depth);
673d57e7 2424 seq_printf(seq, " |-- %pI4\n", &val);
1328042e
SH
2425
2426 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2427 struct fib_alias *fa;
2428
2429 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2430 char buf1[32], buf2[32];
2431
2432 seq_indent(seq, iter->depth+1);
2433 seq_printf(seq, " /%d %s %s", li->plen,
2434 rtn_scope(buf1, sizeof(buf1),
2435 fa->fa_scope),
2436 rtn_type(buf2, sizeof(buf2),
2437 fa->fa_type));
2438 if (fa->fa_tos)
b9c4d82a 2439 seq_printf(seq, " tos=%d", fa->fa_tos);
1328042e 2440 seq_putc(seq, '\n');
cb7b593c
SH
2441 }
2442 }
19baf839 2443 }
cb7b593c 2444
19baf839
RO
2445 return 0;
2446}
2447
f690808e 2448static const struct seq_operations fib_trie_seq_ops = {
cb7b593c
SH
2449 .start = fib_trie_seq_start,
2450 .next = fib_trie_seq_next,
2451 .stop = fib_trie_seq_stop,
2452 .show = fib_trie_seq_show,
19baf839
RO
2453};
2454
cb7b593c 2455static int fib_trie_seq_open(struct inode *inode, struct file *file)
19baf839 2456{
1c340b2f
DL
2457 return seq_open_net(inode, file, &fib_trie_seq_ops,
2458 sizeof(struct fib_trie_iter));
19baf839
RO
2459}
2460
9a32144e 2461static const struct file_operations fib_trie_fops = {
cb7b593c
SH
2462 .owner = THIS_MODULE,
2463 .open = fib_trie_seq_open,
2464 .read = seq_read,
2465 .llseek = seq_lseek,
1c340b2f 2466 .release = seq_release_net,
19baf839
RO
2467};
2468
8315f5d8
SH
2469struct fib_route_iter {
2470 struct seq_net_private p;
2471 struct trie *main_trie;
2472 loff_t pos;
2473 t_key key;
2474};
2475
2476static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2477{
2478 struct leaf *l = NULL;
2479 struct trie *t = iter->main_trie;
2480
2481 /* use cache location of last found key */
2482 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2483 pos -= iter->pos;
2484 else {
2485 iter->pos = 0;
2486 l = trie_firstleaf(t);
2487 }
2488
2489 while (l && pos-- > 0) {
2490 iter->pos++;
2491 l = trie_nextleaf(l);
2492 }
2493
2494 if (l)
2495 iter->key = pos; /* remember it */
2496 else
2497 iter->pos = 0; /* forget it */
2498
2499 return l;
2500}
2501
2502static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2503 __acquires(RCU)
2504{
2505 struct fib_route_iter *iter = seq->private;
2506 struct fib_table *tb;
2507
2508 rcu_read_lock();
1218854a 2509 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
8315f5d8
SH
2510 if (!tb)
2511 return NULL;
2512
2513 iter->main_trie = (struct trie *) tb->tb_data;
2514 if (*pos == 0)
2515 return SEQ_START_TOKEN;
2516 else
2517 return fib_route_get_idx(iter, *pos - 1);
2518}
2519
2520static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2521{
2522 struct fib_route_iter *iter = seq->private;
2523 struct leaf *l = v;
2524
2525 ++*pos;
2526 if (v == SEQ_START_TOKEN) {
2527 iter->pos = 0;
2528 l = trie_firstleaf(iter->main_trie);
2529 } else {
2530 iter->pos++;
2531 l = trie_nextleaf(l);
2532 }
2533
2534 if (l)
2535 iter->key = l->key;
2536 else
2537 iter->pos = 0;
2538 return l;
2539}
2540
2541static void fib_route_seq_stop(struct seq_file *seq, void *v)
2542 __releases(RCU)
2543{
2544 rcu_read_unlock();
2545}
2546
32ab5f80 2547static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
19baf839 2548{
cb7b593c
SH
2549 static unsigned type2flags[RTN_MAX + 1] = {
2550 [7] = RTF_REJECT, [8] = RTF_REJECT,
2551 };
2552 unsigned flags = type2flags[type];
19baf839 2553
cb7b593c
SH
2554 if (fi && fi->fib_nh->nh_gw)
2555 flags |= RTF_GATEWAY;
32ab5f80 2556 if (mask == htonl(0xFFFFFFFF))
cb7b593c
SH
2557 flags |= RTF_HOST;
2558 flags |= RTF_UP;
2559 return flags;
19baf839
RO
2560}
2561
cb7b593c
SH
2562/*
2563 * This outputs /proc/net/route.
2564 * The format of the file is not supposed to be changed
2565 * and needs to be same as fib_hash output to avoid breaking
2566 * legacy utilities
2567 */
2568static int fib_route_seq_show(struct seq_file *seq, void *v)
19baf839 2569{
cb7b593c 2570 struct leaf *l = v;
1328042e
SH
2571 struct leaf_info *li;
2572 struct hlist_node *node;
19baf839 2573
cb7b593c
SH
2574 if (v == SEQ_START_TOKEN) {
2575 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2576 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2577 "\tWindow\tIRTT");
2578 return 0;
2579 }
19baf839 2580
1328042e 2581 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
cb7b593c 2582 struct fib_alias *fa;
32ab5f80 2583 __be32 mask, prefix;
91b9a277 2584
cb7b593c
SH
2585 mask = inet_make_mask(li->plen);
2586 prefix = htonl(l->key);
19baf839 2587
cb7b593c 2588 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1371e37d 2589 const struct fib_info *fi = fa->fa_info;
cb7b593c 2590 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
5e659e4c 2591 int len;
19baf839 2592
cb7b593c
SH
2593 if (fa->fa_type == RTN_BROADCAST
2594 || fa->fa_type == RTN_MULTICAST)
2595 continue;
19baf839 2596
cb7b593c 2597 if (fi)
5e659e4c
PE
2598 seq_printf(seq,
2599 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2600 "%d\t%08X\t%d\t%u\t%u%n",
cb7b593c
SH
2601 fi->fib_dev ? fi->fib_dev->name : "*",
2602 prefix,
2603 fi->fib_nh->nh_gw, flags, 0, 0,
2604 fi->fib_priority,
2605 mask,
a07f5f50
SH
2606 (fi->fib_advmss ?
2607 fi->fib_advmss + 40 : 0),
cb7b593c 2608 fi->fib_window,
5e659e4c 2609 fi->fib_rtt >> 3, &len);
cb7b593c 2610 else
5e659e4c
PE
2611 seq_printf(seq,
2612 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2613 "%d\t%08X\t%d\t%u\t%u%n",
cb7b593c 2614 prefix, 0, flags, 0, 0, 0,
5e659e4c 2615 mask, 0, 0, 0, &len);
19baf839 2616
5e659e4c 2617 seq_printf(seq, "%*s\n", 127 - len, "");
cb7b593c 2618 }
19baf839
RO
2619 }
2620
2621 return 0;
2622}
2623
f690808e 2624static const struct seq_operations fib_route_seq_ops = {
8315f5d8
SH
2625 .start = fib_route_seq_start,
2626 .next = fib_route_seq_next,
2627 .stop = fib_route_seq_stop,
cb7b593c 2628 .show = fib_route_seq_show,
19baf839
RO
2629};
2630
cb7b593c 2631static int fib_route_seq_open(struct inode *inode, struct file *file)
19baf839 2632{
1c340b2f 2633 return seq_open_net(inode, file, &fib_route_seq_ops,
8315f5d8 2634 sizeof(struct fib_route_iter));
19baf839
RO
2635}
2636
9a32144e 2637static const struct file_operations fib_route_fops = {
cb7b593c
SH
2638 .owner = THIS_MODULE,
2639 .open = fib_route_seq_open,
2640 .read = seq_read,
2641 .llseek = seq_lseek,
1c340b2f 2642 .release = seq_release_net,
19baf839
RO
2643};
2644
61a02653 2645int __net_init fib_proc_init(struct net *net)
19baf839 2646{
61a02653 2647 if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
cb7b593c
SH
2648 goto out1;
2649
61a02653
DL
2650 if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2651 &fib_triestat_fops))
cb7b593c
SH
2652 goto out2;
2653
61a02653 2654 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
cb7b593c
SH
2655 goto out3;
2656
19baf839 2657 return 0;
cb7b593c
SH
2658
2659out3:
61a02653 2660 proc_net_remove(net, "fib_triestat");
cb7b593c 2661out2:
61a02653 2662 proc_net_remove(net, "fib_trie");
cb7b593c
SH
2663out1:
2664 return -ENOMEM;
19baf839
RO
2665}
2666
61a02653 2667void __net_exit fib_proc_exit(struct net *net)
19baf839 2668{
61a02653
DL
2669 proc_net_remove(net, "fib_trie");
2670 proc_net_remove(net, "fib_triestat");
2671 proc_net_remove(net, "route");
19baf839
RO
2672}
2673
2674#endif /* CONFIG_PROC_FS */