]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/ipv4/fib_trie.c
net: mark read-only arrays as const
[net-next-2.6.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
e905a9ed 10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
19baf839 11 * Agricultural Sciences.
e905a9ed 12 *
19baf839
RO
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
e905a9ed 16 *
19baf839
RO
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
19baf839
RO
25 *
26 * Code from fib_hash has been reused which includes the following header:
27 *
28 *
29 * INET An implementation of the TCP/IP protocol suite for the LINUX
30 * operating system. INET is implemented using the BSD Socket
31 * interface as the means of communication with the user level.
32 *
33 * IPv4 FIB: lookup engine and maintenance routines.
34 *
35 *
36 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
37 *
38 * This program is free software; you can redistribute it and/or
39 * modify it under the terms of the GNU General Public License
40 * as published by the Free Software Foundation; either version
41 * 2 of the License, or (at your option) any later version.
fd966255
RO
42 *
43 * Substantial contributions to this work comes from:
44 *
45 * David S. Miller, <davem@davemloft.net>
46 * Stephen Hemminger <shemminger@osdl.org>
47 * Paul E. McKenney <paulmck@us.ibm.com>
48 * Patrick McHardy <kaber@trash.net>
19baf839
RO
49 */
50
05eee48c 51#define VERSION "0.408"
19baf839 52
19baf839
RO
53#include <asm/uaccess.h>
54#include <asm/system.h>
1977f032 55#include <linux/bitops.h>
19baf839
RO
56#include <linux/types.h>
57#include <linux/kernel.h>
19baf839
RO
58#include <linux/mm.h>
59#include <linux/string.h>
60#include <linux/socket.h>
61#include <linux/sockios.h>
62#include <linux/errno.h>
63#include <linux/in.h>
64#include <linux/inet.h>
cd8787ab 65#include <linux/inetdevice.h>
19baf839
RO
66#include <linux/netdevice.h>
67#include <linux/if_arp.h>
68#include <linux/proc_fs.h>
2373ce1c 69#include <linux/rcupdate.h>
19baf839
RO
70#include <linux/skbuff.h>
71#include <linux/netlink.h>
72#include <linux/init.h>
73#include <linux/list.h>
457c4cbc 74#include <net/net_namespace.h>
19baf839
RO
75#include <net/ip.h>
76#include <net/protocol.h>
77#include <net/route.h>
78#include <net/tcp.h>
79#include <net/sock.h>
80#include <net/ip_fib.h>
81#include "fib_lookup.h"
82
06ef921d 83#define MAX_STAT_DEPTH 32
19baf839 84
19baf839 85#define KEYLENGTH (8*sizeof(t_key))
19baf839 86
19baf839
RO
87typedef unsigned int t_key;
88
89#define T_TNODE 0
90#define T_LEAF 1
91#define NODE_TYPE_MASK 0x1UL
2373ce1c
RO
92#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
93
91b9a277
OJ
94#define IS_TNODE(n) (!(n->parent & T_LEAF))
95#define IS_LEAF(n) (n->parent & T_LEAF)
19baf839
RO
96
97struct node {
91b9a277 98 unsigned long parent;
8d965444 99 t_key key;
19baf839
RO
100};
101
102struct leaf {
91b9a277 103 unsigned long parent;
8d965444 104 t_key key;
19baf839 105 struct hlist_head list;
2373ce1c 106 struct rcu_head rcu;
19baf839
RO
107};
108
109struct leaf_info {
110 struct hlist_node hlist;
2373ce1c 111 struct rcu_head rcu;
19baf839
RO
112 int plen;
113 struct list_head falh;
114};
115
116struct tnode {
91b9a277 117 unsigned long parent;
8d965444 118 t_key key;
112d8cfc
ED
119 unsigned char pos; /* 2log(KEYLENGTH) bits needed */
120 unsigned char bits; /* 2log(KEYLENGTH) bits needed */
8d965444
ED
121 unsigned int full_children; /* KEYLENGTH bits needed */
122 unsigned int empty_children; /* KEYLENGTH bits needed */
15be75cd
SH
123 union {
124 struct rcu_head rcu;
125 struct work_struct work;
e0f7cb8c 126 struct tnode *tnode_free;
15be75cd 127 };
91b9a277 128 struct node *child[0];
19baf839
RO
129};
130
131#ifdef CONFIG_IP_FIB_TRIE_STATS
132struct trie_use_stats {
133 unsigned int gets;
134 unsigned int backtrack;
135 unsigned int semantic_match_passed;
136 unsigned int semantic_match_miss;
137 unsigned int null_node_hit;
2f36895a 138 unsigned int resize_node_skipped;
19baf839
RO
139};
140#endif
141
142struct trie_stat {
143 unsigned int totdepth;
144 unsigned int maxdepth;
145 unsigned int tnodes;
146 unsigned int leaves;
147 unsigned int nullpointers;
93672292 148 unsigned int prefixes;
06ef921d 149 unsigned int nodesizes[MAX_STAT_DEPTH];
c877efb2 150};
19baf839
RO
151
152struct trie {
91b9a277 153 struct node *trie;
19baf839
RO
154#ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats;
156#endif
19baf839
RO
157};
158
19baf839 159static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
a07f5f50
SH
160static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
161 int wasfull);
19baf839 162static struct node *resize(struct trie *t, struct tnode *tn);
2f80b3c8
RO
163static struct tnode *inflate(struct trie *t, struct tnode *tn);
164static struct tnode *halve(struct trie *t, struct tnode *tn);
e0f7cb8c
JP
165/* tnodes to free after resize(); protected by RTNL */
166static struct tnode *tnode_free_head;
c3059477
JP
167static size_t tnode_free_size;
168
169/*
170 * synchronize_rcu after call_rcu for that many pages; it should be especially
171 * useful before resizing the root node with PREEMPT_NONE configs; the value was
172 * obtained experimentally, aiming to avoid visible slowdown.
173 */
174static const int sync_pages = 128;
19baf839 175
e18b890b 176static struct kmem_cache *fn_alias_kmem __read_mostly;
bc3c8c1e 177static struct kmem_cache *trie_leaf_kmem __read_mostly;
19baf839 178
06801916
SH
179static inline struct tnode *node_parent(struct node *node)
180{
b59cfbf7
ED
181 return (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
182}
183
184static inline struct tnode *node_parent_rcu(struct node *node)
185{
186 struct tnode *ret = node_parent(node);
06801916 187
06801916
SH
188 return rcu_dereference(ret);
189}
190
6440cc9e
SH
191/* Same as rcu_assign_pointer
192 * but that macro() assumes that value is a pointer.
193 */
06801916
SH
194static inline void node_set_parent(struct node *node, struct tnode *ptr)
195{
6440cc9e
SH
196 smp_wmb();
197 node->parent = (unsigned long)ptr | NODE_TYPE(node);
06801916 198}
2373ce1c 199
b59cfbf7
ED
200static inline struct node *tnode_get_child(struct tnode *tn, unsigned int i)
201{
202 BUG_ON(i >= 1U << tn->bits);
2373ce1c 203
b59cfbf7
ED
204 return tn->child[i];
205}
206
207static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i)
19baf839 208{
b59cfbf7 209 struct node *ret = tnode_get_child(tn, i);
19baf839 210
b59cfbf7 211 return rcu_dereference(ret);
19baf839
RO
212}
213
bb435b8d 214static inline int tnode_child_length(const struct tnode *tn)
19baf839 215{
91b9a277 216 return 1 << tn->bits;
19baf839
RO
217}
218
ab66b4a7
SH
219static inline t_key mask_pfx(t_key k, unsigned short l)
220{
221 return (l == 0) ? 0 : k >> (KEYLENGTH-l) << (KEYLENGTH-l);
222}
223
19baf839
RO
224static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
225{
91b9a277 226 if (offset < KEYLENGTH)
19baf839 227 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
91b9a277 228 else
19baf839
RO
229 return 0;
230}
231
232static inline int tkey_equals(t_key a, t_key b)
233{
c877efb2 234 return a == b;
19baf839
RO
235}
236
237static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
238{
c877efb2
SH
239 if (bits == 0 || offset >= KEYLENGTH)
240 return 1;
91b9a277
OJ
241 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
242 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
c877efb2 243}
19baf839
RO
244
245static inline int tkey_mismatch(t_key a, int offset, t_key b)
246{
247 t_key diff = a ^ b;
248 int i = offset;
249
c877efb2
SH
250 if (!diff)
251 return 0;
252 while ((diff << i) >> (KEYLENGTH-1) == 0)
19baf839
RO
253 i++;
254 return i;
255}
256
19baf839 257/*
e905a9ed
YH
258 To understand this stuff, an understanding of keys and all their bits is
259 necessary. Every node in the trie has a key associated with it, but not
19baf839
RO
260 all of the bits in that key are significant.
261
262 Consider a node 'n' and its parent 'tp'.
263
e905a9ed
YH
264 If n is a leaf, every bit in its key is significant. Its presence is
265 necessitated by path compression, since during a tree traversal (when
266 searching for a leaf - unless we are doing an insertion) we will completely
267 ignore all skipped bits we encounter. Thus we need to verify, at the end of
268 a potentially successful search, that we have indeed been walking the
19baf839
RO
269 correct key path.
270
e905a9ed
YH
271 Note that we can never "miss" the correct key in the tree if present by
272 following the wrong path. Path compression ensures that segments of the key
273 that are the same for all keys with a given prefix are skipped, but the
274 skipped part *is* identical for each node in the subtrie below the skipped
275 bit! trie_insert() in this implementation takes care of that - note the
19baf839
RO
276 call to tkey_sub_equals() in trie_insert().
277
e905a9ed 278 if n is an internal node - a 'tnode' here, the various parts of its key
19baf839
RO
279 have many different meanings.
280
e905a9ed 281 Example:
19baf839
RO
282 _________________________________________________________________
283 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
284 -----------------------------------------------------------------
e905a9ed 285 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
19baf839
RO
286
287 _________________________________________________________________
288 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
289 -----------------------------------------------------------------
290 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
291
292 tp->pos = 7
293 tp->bits = 3
294 n->pos = 15
91b9a277 295 n->bits = 4
19baf839 296
e905a9ed
YH
297 First, let's just ignore the bits that come before the parent tp, that is
298 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
19baf839
RO
299 not use them for anything.
300
301 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
e905a9ed 302 index into the parent's child array. That is, they will be used to find
19baf839
RO
303 'n' among tp's children.
304
305 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
306 for the node n.
307
e905a9ed 308 All the bits we have seen so far are significant to the node n. The rest
19baf839
RO
309 of the bits are really not needed or indeed known in n->key.
310
e905a9ed 311 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
19baf839 312 n's child array, and will of course be different for each child.
e905a9ed 313
c877efb2 314
19baf839
RO
315 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
316 at this point.
317
318*/
319
0c7770c7 320static inline void check_tnode(const struct tnode *tn)
19baf839 321{
0c7770c7 322 WARN_ON(tn && tn->pos+tn->bits > 32);
19baf839
RO
323}
324
f5026fab
DL
325static const int halve_threshold = 25;
326static const int inflate_threshold = 50;
345aa031
JP
327static const int halve_threshold_root = 15;
328static const int inflate_threshold_root = 25;
19baf839 329
be916cde
JP
330static int inflate_threshold_root_fix;
331#define INFLATE_FIX_MAX 10 /* a comment in resize() */
2373ce1c
RO
332
333static void __alias_free_mem(struct rcu_head *head)
19baf839 334{
2373ce1c
RO
335 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
336 kmem_cache_free(fn_alias_kmem, fa);
19baf839
RO
337}
338
2373ce1c 339static inline void alias_free_mem_rcu(struct fib_alias *fa)
19baf839 340{
2373ce1c
RO
341 call_rcu(&fa->rcu, __alias_free_mem);
342}
91b9a277 343
2373ce1c
RO
344static void __leaf_free_rcu(struct rcu_head *head)
345{
bc3c8c1e
SH
346 struct leaf *l = container_of(head, struct leaf, rcu);
347 kmem_cache_free(trie_leaf_kmem, l);
2373ce1c 348}
91b9a277 349
387a5487
SH
350static inline void free_leaf(struct leaf *l)
351{
352 call_rcu_bh(&l->rcu, __leaf_free_rcu);
353}
354
2373ce1c 355static void __leaf_info_free_rcu(struct rcu_head *head)
19baf839 356{
2373ce1c 357 kfree(container_of(head, struct leaf_info, rcu));
19baf839
RO
358}
359
2373ce1c 360static inline void free_leaf_info(struct leaf_info *leaf)
19baf839 361{
2373ce1c 362 call_rcu(&leaf->rcu, __leaf_info_free_rcu);
19baf839
RO
363}
364
8d965444 365static struct tnode *tnode_alloc(size_t size)
f0e36f8c 366{
2373ce1c 367 if (size <= PAGE_SIZE)
8d965444 368 return kzalloc(size, GFP_KERNEL);
15be75cd
SH
369 else
370 return __vmalloc(size, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
371}
2373ce1c 372
15be75cd
SH
373static void __tnode_vfree(struct work_struct *arg)
374{
375 struct tnode *tn = container_of(arg, struct tnode, work);
376 vfree(tn);
f0e36f8c
PM
377}
378
2373ce1c 379static void __tnode_free_rcu(struct rcu_head *head)
f0e36f8c 380{
2373ce1c 381 struct tnode *tn = container_of(head, struct tnode, rcu);
8d965444
ED
382 size_t size = sizeof(struct tnode) +
383 (sizeof(struct node *) << tn->bits);
f0e36f8c
PM
384
385 if (size <= PAGE_SIZE)
386 kfree(tn);
15be75cd
SH
387 else {
388 INIT_WORK(&tn->work, __tnode_vfree);
389 schedule_work(&tn->work);
390 }
f0e36f8c
PM
391}
392
2373ce1c
RO
393static inline void tnode_free(struct tnode *tn)
394{
387a5487
SH
395 if (IS_LEAF(tn))
396 free_leaf((struct leaf *) tn);
397 else
550e29bc 398 call_rcu(&tn->rcu, __tnode_free_rcu);
2373ce1c
RO
399}
400
e0f7cb8c
JP
401static void tnode_free_safe(struct tnode *tn)
402{
403 BUG_ON(IS_LEAF(tn));
7b85576d
JP
404 tn->tnode_free = tnode_free_head;
405 tnode_free_head = tn;
c3059477
JP
406 tnode_free_size += sizeof(struct tnode) +
407 (sizeof(struct node *) << tn->bits);
e0f7cb8c
JP
408}
409
410static void tnode_free_flush(void)
411{
412 struct tnode *tn;
413
414 while ((tn = tnode_free_head)) {
415 tnode_free_head = tn->tnode_free;
416 tn->tnode_free = NULL;
417 tnode_free(tn);
418 }
c3059477
JP
419
420 if (tnode_free_size >= PAGE_SIZE * sync_pages) {
421 tnode_free_size = 0;
422 synchronize_rcu();
423 }
e0f7cb8c
JP
424}
425
2373ce1c
RO
426static struct leaf *leaf_new(void)
427{
bc3c8c1e 428 struct leaf *l = kmem_cache_alloc(trie_leaf_kmem, GFP_KERNEL);
2373ce1c
RO
429 if (l) {
430 l->parent = T_LEAF;
431 INIT_HLIST_HEAD(&l->list);
432 }
433 return l;
434}
435
436static struct leaf_info *leaf_info_new(int plen)
437{
438 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
439 if (li) {
440 li->plen = plen;
441 INIT_LIST_HEAD(&li->falh);
442 }
443 return li;
444}
445
a07f5f50 446static struct tnode *tnode_new(t_key key, int pos, int bits)
19baf839 447{
8d965444 448 size_t sz = sizeof(struct tnode) + (sizeof(struct node *) << bits);
f0e36f8c 449 struct tnode *tn = tnode_alloc(sz);
19baf839 450
91b9a277 451 if (tn) {
2373ce1c 452 tn->parent = T_TNODE;
19baf839
RO
453 tn->pos = pos;
454 tn->bits = bits;
455 tn->key = key;
456 tn->full_children = 0;
457 tn->empty_children = 1<<bits;
458 }
c877efb2 459
8d965444
ED
460 pr_debug("AT %p s=%u %lu\n", tn, (unsigned int) sizeof(struct tnode),
461 (unsigned long) (sizeof(struct node) << bits));
19baf839
RO
462 return tn;
463}
464
19baf839
RO
465/*
466 * Check whether a tnode 'n' is "full", i.e. it is an internal node
467 * and no bits are skipped. See discussion in dyntree paper p. 6
468 */
469
bb435b8d 470static inline int tnode_full(const struct tnode *tn, const struct node *n)
19baf839 471{
c877efb2 472 if (n == NULL || IS_LEAF(n))
19baf839
RO
473 return 0;
474
475 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
476}
477
a07f5f50
SH
478static inline void put_child(struct trie *t, struct tnode *tn, int i,
479 struct node *n)
19baf839
RO
480{
481 tnode_put_child_reorg(tn, i, n, -1);
482}
483
c877efb2 484 /*
19baf839
RO
485 * Add a child at position i overwriting the old value.
486 * Update the value of full_children and empty_children.
487 */
488
a07f5f50
SH
489static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n,
490 int wasfull)
19baf839 491{
2373ce1c 492 struct node *chi = tn->child[i];
19baf839
RO
493 int isfull;
494
0c7770c7
SH
495 BUG_ON(i >= 1<<tn->bits);
496
19baf839
RO
497 /* update emptyChildren */
498 if (n == NULL && chi != NULL)
499 tn->empty_children++;
500 else if (n != NULL && chi == NULL)
501 tn->empty_children--;
c877efb2 502
19baf839 503 /* update fullChildren */
91b9a277 504 if (wasfull == -1)
19baf839
RO
505 wasfull = tnode_full(tn, chi);
506
507 isfull = tnode_full(tn, n);
c877efb2 508 if (wasfull && !isfull)
19baf839 509 tn->full_children--;
c877efb2 510 else if (!wasfull && isfull)
19baf839 511 tn->full_children++;
91b9a277 512
c877efb2 513 if (n)
06801916 514 node_set_parent(n, tn);
19baf839 515
2373ce1c 516 rcu_assign_pointer(tn->child[i], n);
19baf839
RO
517}
518
c877efb2 519static struct node *resize(struct trie *t, struct tnode *tn)
19baf839
RO
520{
521 int i;
2f36895a 522 int err = 0;
2f80b3c8 523 struct tnode *old_tn;
e6308be8
RO
524 int inflate_threshold_use;
525 int halve_threshold_use;
05eee48c 526 int max_resize;
19baf839 527
e905a9ed 528 if (!tn)
19baf839
RO
529 return NULL;
530
0c7770c7
SH
531 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
532 tn, inflate_threshold, halve_threshold);
19baf839
RO
533
534 /* No children */
535 if (tn->empty_children == tnode_child_length(tn)) {
e0f7cb8c 536 tnode_free_safe(tn);
19baf839
RO
537 return NULL;
538 }
539 /* One child */
540 if (tn->empty_children == tnode_child_length(tn) - 1)
541 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 542 struct node *n;
19baf839 543
91b9a277 544 n = tn->child[i];
2373ce1c 545 if (!n)
91b9a277 546 continue;
91b9a277
OJ
547
548 /* compress one level */
06801916 549 node_set_parent(n, NULL);
e0f7cb8c 550 tnode_free_safe(tn);
91b9a277 551 return n;
19baf839 552 }
c877efb2 553 /*
19baf839
RO
554 * Double as long as the resulting node has a number of
555 * nonempty nodes that are above the threshold.
556 */
557
558 /*
c877efb2
SH
559 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
560 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 561 * Telecommunications, page 6:
c877efb2 562 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
563 * children in the *doubled* node is at least 'high'."
564 *
c877efb2
SH
565 * 'high' in this instance is the variable 'inflate_threshold'. It
566 * is expressed as a percentage, so we multiply it with
567 * tnode_child_length() and instead of multiplying by 2 (since the
568 * child array will be doubled by inflate()) and multiplying
569 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 570 * multiply the left-hand side by 50.
c877efb2
SH
571 *
572 * The left-hand side may look a bit weird: tnode_child_length(tn)
573 * - tn->empty_children is of course the number of non-null children
574 * in the current node. tn->full_children is the number of "full"
19baf839 575 * children, that is non-null tnodes with a skip value of 0.
c877efb2 576 * All of those will be doubled in the resulting inflated tnode, so
19baf839 577 * we just count them one extra time here.
c877efb2 578 *
19baf839 579 * A clearer way to write this would be:
c877efb2 580 *
19baf839 581 * to_be_doubled = tn->full_children;
c877efb2 582 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
583 * tn->full_children;
584 *
585 * new_child_length = tnode_child_length(tn) * 2;
586 *
c877efb2 587 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
588 * new_child_length;
589 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
590 *
591 * ...and so on, tho it would mess up the while () loop.
592 *
19baf839
RO
593 * anyway,
594 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
595 * inflate_threshold
c877efb2 596 *
19baf839
RO
597 * avoid a division:
598 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
599 * inflate_threshold * new_child_length
c877efb2 600 *
19baf839 601 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 602 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 603 * tn->full_children) >= inflate_threshold * new_child_length
c877efb2 604 *
19baf839 605 * expand new_child_length:
c877efb2 606 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 607 * tn->full_children) >=
19baf839 608 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 609 *
19baf839 610 * shorten again:
c877efb2 611 * 50 * (tn->full_children + tnode_child_length(tn) -
91b9a277 612 * tn->empty_children) >= inflate_threshold *
19baf839 613 * tnode_child_length(tn)
c877efb2 614 *
19baf839
RO
615 */
616
617 check_tnode(tn);
c877efb2 618
e6308be8
RO
619 /* Keep root node larger */
620
132adf54 621 if (!tn->parent)
be916cde
JP
622 inflate_threshold_use = inflate_threshold_root +
623 inflate_threshold_root_fix;
e905a9ed 624 else
e6308be8
RO
625 inflate_threshold_use = inflate_threshold;
626
2f36895a 627 err = 0;
05eee48c
RO
628 max_resize = 10;
629 while ((tn->full_children > 0 && max_resize-- &&
a07f5f50
SH
630 50 * (tn->full_children + tnode_child_length(tn)
631 - tn->empty_children)
632 >= inflate_threshold_use * tnode_child_length(tn))) {
19baf839 633
2f80b3c8
RO
634 old_tn = tn;
635 tn = inflate(t, tn);
a07f5f50 636
2f80b3c8
RO
637 if (IS_ERR(tn)) {
638 tn = old_tn;
2f36895a
RO
639#ifdef CONFIG_IP_FIB_TRIE_STATS
640 t->stats.resize_node_skipped++;
641#endif
642 break;
643 }
19baf839
RO
644 }
645
05eee48c 646 if (max_resize < 0) {
be916cde
JP
647 if (!tn->parent) {
648 /*
649 * It was observed that during large updates even
650 * inflate_threshold_root = 35 might be needed to avoid
651 * this warning; but it should be temporary, so let's
652 * try to handle this automatically.
653 */
654 if (inflate_threshold_root_fix < INFLATE_FIX_MAX)
655 inflate_threshold_root_fix++;
656 else
657 pr_warning("Fix inflate_threshold_root."
658 " Now=%d size=%d bits fix=%d\n",
659 inflate_threshold_root, tn->bits,
660 inflate_threshold_root_fix);
661 } else {
a07f5f50
SH
662 pr_warning("Fix inflate_threshold."
663 " Now=%d size=%d bits\n",
664 inflate_threshold, tn->bits);
be916cde
JP
665 }
666 } else if (max_resize > 3 && !tn->parent && inflate_threshold_root_fix)
667 inflate_threshold_root_fix--;
05eee48c 668
19baf839
RO
669 check_tnode(tn);
670
671 /*
672 * Halve as long as the number of empty children in this
673 * node is above threshold.
674 */
2f36895a 675
e6308be8
RO
676
677 /* Keep root node larger */
678
132adf54 679 if (!tn->parent)
e6308be8 680 halve_threshold_use = halve_threshold_root;
e905a9ed 681 else
e6308be8
RO
682 halve_threshold_use = halve_threshold;
683
2f36895a 684 err = 0;
05eee48c
RO
685 max_resize = 10;
686 while (tn->bits > 1 && max_resize-- &&
19baf839 687 100 * (tnode_child_length(tn) - tn->empty_children) <
e6308be8 688 halve_threshold_use * tnode_child_length(tn)) {
2f36895a 689
2f80b3c8
RO
690 old_tn = tn;
691 tn = halve(t, tn);
692 if (IS_ERR(tn)) {
693 tn = old_tn;
2f36895a
RO
694#ifdef CONFIG_IP_FIB_TRIE_STATS
695 t->stats.resize_node_skipped++;
696#endif
697 break;
698 }
699 }
19baf839 700
05eee48c
RO
701 if (max_resize < 0) {
702 if (!tn->parent)
a07f5f50
SH
703 pr_warning("Fix halve_threshold_root."
704 " Now=%d size=%d bits\n",
705 halve_threshold_root, tn->bits);
05eee48c 706 else
a07f5f50
SH
707 pr_warning("Fix halve_threshold."
708 " Now=%d size=%d bits\n",
709 halve_threshold, tn->bits);
05eee48c 710 }
c877efb2 711
19baf839 712 /* Only one child remains */
19baf839
RO
713 if (tn->empty_children == tnode_child_length(tn) - 1)
714 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 715 struct node *n;
19baf839 716
91b9a277 717 n = tn->child[i];
2373ce1c 718 if (!n)
91b9a277 719 continue;
91b9a277
OJ
720
721 /* compress one level */
722
06801916 723 node_set_parent(n, NULL);
e0f7cb8c 724 tnode_free_safe(tn);
91b9a277 725 return n;
19baf839
RO
726 }
727
728 return (struct node *) tn;
729}
730
2f80b3c8 731static struct tnode *inflate(struct trie *t, struct tnode *tn)
19baf839 732{
19baf839
RO
733 struct tnode *oldtnode = tn;
734 int olen = tnode_child_length(tn);
735 int i;
736
0c7770c7 737 pr_debug("In inflate\n");
19baf839
RO
738
739 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
740
0c7770c7 741 if (!tn)
2f80b3c8 742 return ERR_PTR(-ENOMEM);
2f36895a
RO
743
744 /*
c877efb2
SH
745 * Preallocate and store tnodes before the actual work so we
746 * don't get into an inconsistent state if memory allocation
747 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
748 * of tnode is ignored.
749 */
91b9a277
OJ
750
751 for (i = 0; i < olen; i++) {
a07f5f50 752 struct tnode *inode;
2f36895a 753
a07f5f50 754 inode = (struct tnode *) tnode_get_child(oldtnode, i);
2f36895a
RO
755 if (inode &&
756 IS_TNODE(inode) &&
757 inode->pos == oldtnode->pos + oldtnode->bits &&
758 inode->bits > 1) {
759 struct tnode *left, *right;
ab66b4a7 760 t_key m = ~0U << (KEYLENGTH - 1) >> inode->pos;
c877efb2 761
2f36895a
RO
762 left = tnode_new(inode->key&(~m), inode->pos + 1,
763 inode->bits - 1);
2f80b3c8
RO
764 if (!left)
765 goto nomem;
91b9a277 766
2f36895a
RO
767 right = tnode_new(inode->key|m, inode->pos + 1,
768 inode->bits - 1);
769
e905a9ed 770 if (!right) {
2f80b3c8
RO
771 tnode_free(left);
772 goto nomem;
e905a9ed 773 }
2f36895a
RO
774
775 put_child(t, tn, 2*i, (struct node *) left);
776 put_child(t, tn, 2*i+1, (struct node *) right);
777 }
778 }
779
91b9a277 780 for (i = 0; i < olen; i++) {
c95aaf9a 781 struct tnode *inode;
19baf839 782 struct node *node = tnode_get_child(oldtnode, i);
91b9a277
OJ
783 struct tnode *left, *right;
784 int size, j;
c877efb2 785
19baf839
RO
786 /* An empty child */
787 if (node == NULL)
788 continue;
789
790 /* A leaf or an internal node with skipped bits */
791
c877efb2 792 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
19baf839 793 tn->pos + tn->bits - 1) {
a07f5f50
SH
794 if (tkey_extract_bits(node->key,
795 oldtnode->pos + oldtnode->bits,
796 1) == 0)
19baf839
RO
797 put_child(t, tn, 2*i, node);
798 else
799 put_child(t, tn, 2*i+1, node);
800 continue;
801 }
802
803 /* An internal node with two children */
804 inode = (struct tnode *) node;
805
806 if (inode->bits == 1) {
807 put_child(t, tn, 2*i, inode->child[0]);
808 put_child(t, tn, 2*i+1, inode->child[1]);
809
e0f7cb8c 810 tnode_free_safe(inode);
91b9a277 811 continue;
19baf839
RO
812 }
813
91b9a277
OJ
814 /* An internal node with more than two children */
815
816 /* We will replace this node 'inode' with two new
817 * ones, 'left' and 'right', each with half of the
818 * original children. The two new nodes will have
819 * a position one bit further down the key and this
820 * means that the "significant" part of their keys
821 * (see the discussion near the top of this file)
822 * will differ by one bit, which will be "0" in
823 * left's key and "1" in right's key. Since we are
824 * moving the key position by one step, the bit that
825 * we are moving away from - the bit at position
826 * (inode->pos) - is the one that will differ between
827 * left and right. So... we synthesize that bit in the
828 * two new keys.
829 * The mask 'm' below will be a single "one" bit at
830 * the position (inode->pos)
831 */
19baf839 832
91b9a277
OJ
833 /* Use the old key, but set the new significant
834 * bit to zero.
835 */
2f36895a 836
91b9a277
OJ
837 left = (struct tnode *) tnode_get_child(tn, 2*i);
838 put_child(t, tn, 2*i, NULL);
2f36895a 839
91b9a277 840 BUG_ON(!left);
2f36895a 841
91b9a277
OJ
842 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
843 put_child(t, tn, 2*i+1, NULL);
19baf839 844
91b9a277 845 BUG_ON(!right);
19baf839 846
91b9a277
OJ
847 size = tnode_child_length(left);
848 for (j = 0; j < size; j++) {
849 put_child(t, left, j, inode->child[j]);
850 put_child(t, right, j, inode->child[j + size]);
19baf839 851 }
91b9a277
OJ
852 put_child(t, tn, 2*i, resize(t, left));
853 put_child(t, tn, 2*i+1, resize(t, right));
854
e0f7cb8c 855 tnode_free_safe(inode);
19baf839 856 }
e0f7cb8c 857 tnode_free_safe(oldtnode);
19baf839 858 return tn;
2f80b3c8
RO
859nomem:
860 {
861 int size = tnode_child_length(tn);
862 int j;
863
0c7770c7 864 for (j = 0; j < size; j++)
2f80b3c8
RO
865 if (tn->child[j])
866 tnode_free((struct tnode *)tn->child[j]);
867
868 tnode_free(tn);
0c7770c7 869
2f80b3c8
RO
870 return ERR_PTR(-ENOMEM);
871 }
19baf839
RO
872}
873
2f80b3c8 874static struct tnode *halve(struct trie *t, struct tnode *tn)
19baf839
RO
875{
876 struct tnode *oldtnode = tn;
877 struct node *left, *right;
878 int i;
879 int olen = tnode_child_length(tn);
880
0c7770c7 881 pr_debug("In halve\n");
c877efb2
SH
882
883 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 884
2f80b3c8
RO
885 if (!tn)
886 return ERR_PTR(-ENOMEM);
2f36895a
RO
887
888 /*
c877efb2
SH
889 * Preallocate and store tnodes before the actual work so we
890 * don't get into an inconsistent state if memory allocation
891 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
892 * of tnode is ignored.
893 */
894
91b9a277 895 for (i = 0; i < olen; i += 2) {
2f36895a
RO
896 left = tnode_get_child(oldtnode, i);
897 right = tnode_get_child(oldtnode, i+1);
c877efb2 898
2f36895a 899 /* Two nonempty children */
0c7770c7 900 if (left && right) {
2f80b3c8 901 struct tnode *newn;
0c7770c7 902
2f80b3c8 903 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
0c7770c7
SH
904
905 if (!newn)
2f80b3c8 906 goto nomem;
0c7770c7 907
2f80b3c8 908 put_child(t, tn, i/2, (struct node *)newn);
2f36895a 909 }
2f36895a 910
2f36895a 911 }
19baf839 912
91b9a277
OJ
913 for (i = 0; i < olen; i += 2) {
914 struct tnode *newBinNode;
915
19baf839
RO
916 left = tnode_get_child(oldtnode, i);
917 right = tnode_get_child(oldtnode, i+1);
c877efb2 918
19baf839
RO
919 /* At least one of the children is empty */
920 if (left == NULL) {
921 if (right == NULL) /* Both are empty */
922 continue;
923 put_child(t, tn, i/2, right);
91b9a277 924 continue;
0c7770c7 925 }
91b9a277
OJ
926
927 if (right == NULL) {
19baf839 928 put_child(t, tn, i/2, left);
91b9a277
OJ
929 continue;
930 }
c877efb2 931
19baf839 932 /* Two nonempty children */
91b9a277
OJ
933 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
934 put_child(t, tn, i/2, NULL);
91b9a277
OJ
935 put_child(t, newBinNode, 0, left);
936 put_child(t, newBinNode, 1, right);
937 put_child(t, tn, i/2, resize(t, newBinNode));
19baf839 938 }
e0f7cb8c 939 tnode_free_safe(oldtnode);
19baf839 940 return tn;
2f80b3c8
RO
941nomem:
942 {
943 int size = tnode_child_length(tn);
944 int j;
945
0c7770c7 946 for (j = 0; j < size; j++)
2f80b3c8
RO
947 if (tn->child[j])
948 tnode_free((struct tnode *)tn->child[j]);
949
950 tnode_free(tn);
0c7770c7 951
2f80b3c8
RO
952 return ERR_PTR(-ENOMEM);
953 }
19baf839
RO
954}
955
772cb712 956/* readside must use rcu_read_lock currently dump routines
2373ce1c
RO
957 via get_fa_head and dump */
958
772cb712 959static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
19baf839 960{
772cb712 961 struct hlist_head *head = &l->list;
19baf839
RO
962 struct hlist_node *node;
963 struct leaf_info *li;
964
2373ce1c 965 hlist_for_each_entry_rcu(li, node, head, hlist)
c877efb2 966 if (li->plen == plen)
19baf839 967 return li;
91b9a277 968
19baf839
RO
969 return NULL;
970}
971
a07f5f50 972static inline struct list_head *get_fa_head(struct leaf *l, int plen)
19baf839 973{
772cb712 974 struct leaf_info *li = find_leaf_info(l, plen);
c877efb2 975
91b9a277
OJ
976 if (!li)
977 return NULL;
c877efb2 978
91b9a277 979 return &li->falh;
19baf839
RO
980}
981
982static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
983{
e905a9ed
YH
984 struct leaf_info *li = NULL, *last = NULL;
985 struct hlist_node *node;
986
987 if (hlist_empty(head)) {
988 hlist_add_head_rcu(&new->hlist, head);
989 } else {
990 hlist_for_each_entry(li, node, head, hlist) {
991 if (new->plen > li->plen)
992 break;
993
994 last = li;
995 }
996 if (last)
997 hlist_add_after_rcu(&last->hlist, &new->hlist);
998 else
999 hlist_add_before_rcu(&new->hlist, &li->hlist);
1000 }
19baf839
RO
1001}
1002
2373ce1c
RO
1003/* rcu_read_lock needs to be hold by caller from readside */
1004
19baf839
RO
1005static struct leaf *
1006fib_find_node(struct trie *t, u32 key)
1007{
1008 int pos;
1009 struct tnode *tn;
1010 struct node *n;
1011
1012 pos = 0;
2373ce1c 1013 n = rcu_dereference(t->trie);
19baf839
RO
1014
1015 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1016 tn = (struct tnode *) n;
91b9a277 1017
19baf839 1018 check_tnode(tn);
91b9a277 1019
c877efb2 1020 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
91b9a277 1021 pos = tn->pos + tn->bits;
a07f5f50
SH
1022 n = tnode_get_child_rcu(tn,
1023 tkey_extract_bits(key,
1024 tn->pos,
1025 tn->bits));
91b9a277 1026 } else
19baf839
RO
1027 break;
1028 }
1029 /* Case we have found a leaf. Compare prefixes */
1030
91b9a277
OJ
1031 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
1032 return (struct leaf *)n;
1033
19baf839
RO
1034 return NULL;
1035}
1036
7b85576d 1037static void trie_rebalance(struct trie *t, struct tnode *tn)
19baf839 1038{
19baf839 1039 int wasfull;
3ed18d76 1040 t_key cindex, key;
06801916 1041 struct tnode *tp;
19baf839 1042
3ed18d76
RO
1043 key = tn->key;
1044
06801916 1045 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
19baf839
RO
1046 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1047 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
a07f5f50
SH
1048 tn = (struct tnode *) resize(t, (struct tnode *)tn);
1049
1050 tnode_put_child_reorg((struct tnode *)tp, cindex,
1051 (struct node *)tn, wasfull);
91b9a277 1052
06801916 1053 tp = node_parent((struct node *) tn);
008440e3
JP
1054 if (!tp)
1055 rcu_assign_pointer(t->trie, (struct node *)tn);
1056
e0f7cb8c 1057 tnode_free_flush();
06801916 1058 if (!tp)
19baf839 1059 break;
06801916 1060 tn = tp;
19baf839 1061 }
06801916 1062
19baf839 1063 /* Handle last (top) tnode */
7b85576d 1064 if (IS_TNODE(tn))
a07f5f50 1065 tn = (struct tnode *)resize(t, (struct tnode *)tn);
19baf839 1066
7b85576d
JP
1067 rcu_assign_pointer(t->trie, (struct node *)tn);
1068 tnode_free_flush();
1069
1070 return;
19baf839
RO
1071}
1072
2373ce1c
RO
1073/* only used from updater-side */
1074
fea86ad8 1075static struct list_head *fib_insert_node(struct trie *t, u32 key, int plen)
19baf839
RO
1076{
1077 int pos, newpos;
1078 struct tnode *tp = NULL, *tn = NULL;
1079 struct node *n;
1080 struct leaf *l;
1081 int missbit;
c877efb2 1082 struct list_head *fa_head = NULL;
19baf839
RO
1083 struct leaf_info *li;
1084 t_key cindex;
1085
1086 pos = 0;
c877efb2 1087 n = t->trie;
19baf839 1088
c877efb2
SH
1089 /* If we point to NULL, stop. Either the tree is empty and we should
1090 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 1091 * and we should just put our new leaf in that.
c877efb2
SH
1092 * If we point to a T_TNODE, check if it matches our key. Note that
1093 * a T_TNODE might be skipping any number of bits - its 'pos' need
19baf839
RO
1094 * not be the parent's 'pos'+'bits'!
1095 *
c877efb2 1096 * If it does match the current key, get pos/bits from it, extract
19baf839
RO
1097 * the index from our key, push the T_TNODE and walk the tree.
1098 *
1099 * If it doesn't, we have to replace it with a new T_TNODE.
1100 *
c877efb2
SH
1101 * If we point to a T_LEAF, it might or might not have the same key
1102 * as we do. If it does, just change the value, update the T_LEAF's
1103 * value, and return it.
19baf839
RO
1104 * If it doesn't, we need to replace it with a T_TNODE.
1105 */
1106
1107 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1108 tn = (struct tnode *) n;
91b9a277 1109
c877efb2 1110 check_tnode(tn);
91b9a277 1111
c877efb2 1112 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
19baf839 1113 tp = tn;
91b9a277 1114 pos = tn->pos + tn->bits;
a07f5f50
SH
1115 n = tnode_get_child(tn,
1116 tkey_extract_bits(key,
1117 tn->pos,
1118 tn->bits));
19baf839 1119
06801916 1120 BUG_ON(n && node_parent(n) != tn);
91b9a277 1121 } else
19baf839
RO
1122 break;
1123 }
1124
1125 /*
1126 * n ----> NULL, LEAF or TNODE
1127 *
c877efb2 1128 * tp is n's (parent) ----> NULL or TNODE
19baf839
RO
1129 */
1130
91b9a277 1131 BUG_ON(tp && IS_LEAF(tp));
19baf839
RO
1132
1133 /* Case 1: n is a leaf. Compare prefixes */
1134
c877efb2 1135 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
c95aaf9a 1136 l = (struct leaf *) n;
19baf839 1137 li = leaf_info_new(plen);
91b9a277 1138
fea86ad8
SH
1139 if (!li)
1140 return NULL;
19baf839
RO
1141
1142 fa_head = &li->falh;
1143 insert_leaf_info(&l->list, li);
1144 goto done;
1145 }
19baf839
RO
1146 l = leaf_new();
1147
fea86ad8
SH
1148 if (!l)
1149 return NULL;
19baf839
RO
1150
1151 l->key = key;
1152 li = leaf_info_new(plen);
1153
c877efb2 1154 if (!li) {
387a5487 1155 free_leaf(l);
fea86ad8 1156 return NULL;
f835e471 1157 }
19baf839
RO
1158
1159 fa_head = &li->falh;
1160 insert_leaf_info(&l->list, li);
1161
19baf839 1162 if (t->trie && n == NULL) {
91b9a277 1163 /* Case 2: n is NULL, and will just insert a new leaf */
19baf839 1164
06801916 1165 node_set_parent((struct node *)l, tp);
19baf839 1166
91b9a277
OJ
1167 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1168 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1169 } else {
1170 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
c877efb2
SH
1171 /*
1172 * Add a new tnode here
19baf839
RO
1173 * first tnode need some special handling
1174 */
1175
1176 if (tp)
91b9a277 1177 pos = tp->pos+tp->bits;
19baf839 1178 else
91b9a277
OJ
1179 pos = 0;
1180
c877efb2 1181 if (n) {
19baf839
RO
1182 newpos = tkey_mismatch(key, pos, n->key);
1183 tn = tnode_new(n->key, newpos, 1);
91b9a277 1184 } else {
19baf839 1185 newpos = 0;
c877efb2 1186 tn = tnode_new(key, newpos, 1); /* First tnode */
19baf839 1187 }
19baf839 1188
c877efb2 1189 if (!tn) {
f835e471 1190 free_leaf_info(li);
387a5487 1191 free_leaf(l);
fea86ad8 1192 return NULL;
91b9a277
OJ
1193 }
1194
06801916 1195 node_set_parent((struct node *)tn, tp);
19baf839 1196
91b9a277 1197 missbit = tkey_extract_bits(key, newpos, 1);
19baf839
RO
1198 put_child(t, tn, missbit, (struct node *)l);
1199 put_child(t, tn, 1-missbit, n);
1200
c877efb2 1201 if (tp) {
19baf839 1202 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
a07f5f50
SH
1203 put_child(t, (struct tnode *)tp, cindex,
1204 (struct node *)tn);
91b9a277 1205 } else {
a07f5f50 1206 rcu_assign_pointer(t->trie, (struct node *)tn);
19baf839
RO
1207 tp = tn;
1208 }
1209 }
91b9a277
OJ
1210
1211 if (tp && tp->pos + tp->bits > 32)
a07f5f50
SH
1212 pr_warning("fib_trie"
1213 " tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
1214 tp, tp->pos, tp->bits, key, plen);
91b9a277 1215
19baf839 1216 /* Rebalance the trie */
2373ce1c 1217
7b85576d 1218 trie_rebalance(t, tp);
f835e471 1219done:
19baf839
RO
1220 return fa_head;
1221}
1222
d562f1f8
RO
1223/*
1224 * Caller must hold RTNL.
1225 */
4e902c57 1226static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1227{
1228 struct trie *t = (struct trie *) tb->tb_data;
1229 struct fib_alias *fa, *new_fa;
c877efb2 1230 struct list_head *fa_head = NULL;
19baf839 1231 struct fib_info *fi;
4e902c57
TG
1232 int plen = cfg->fc_dst_len;
1233 u8 tos = cfg->fc_tos;
19baf839
RO
1234 u32 key, mask;
1235 int err;
1236 struct leaf *l;
1237
1238 if (plen > 32)
1239 return -EINVAL;
1240
4e902c57 1241 key = ntohl(cfg->fc_dst);
19baf839 1242
2dfe55b4 1243 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
19baf839 1244
91b9a277 1245 mask = ntohl(inet_make_mask(plen));
19baf839 1246
c877efb2 1247 if (key & ~mask)
19baf839
RO
1248 return -EINVAL;
1249
1250 key = key & mask;
1251
4e902c57
TG
1252 fi = fib_create_info(cfg);
1253 if (IS_ERR(fi)) {
1254 err = PTR_ERR(fi);
19baf839 1255 goto err;
4e902c57 1256 }
19baf839
RO
1257
1258 l = fib_find_node(t, key);
c877efb2 1259 fa = NULL;
19baf839 1260
c877efb2 1261 if (l) {
19baf839
RO
1262 fa_head = get_fa_head(l, plen);
1263 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1264 }
1265
1266 /* Now fa, if non-NULL, points to the first fib alias
1267 * with the same keys [prefix,tos,priority], if such key already
1268 * exists or to the node before which we will insert new one.
1269 *
1270 * If fa is NULL, we will need to allocate a new one and
1271 * insert to the head of f.
1272 *
1273 * If f is NULL, no fib node matched the destination key
1274 * and we need to allocate a new one of those as well.
1275 */
1276
936f6f8e
JA
1277 if (fa && fa->fa_tos == tos &&
1278 fa->fa_info->fib_priority == fi->fib_priority) {
1279 struct fib_alias *fa_first, *fa_match;
19baf839
RO
1280
1281 err = -EEXIST;
4e902c57 1282 if (cfg->fc_nlflags & NLM_F_EXCL)
19baf839
RO
1283 goto out;
1284
936f6f8e
JA
1285 /* We have 2 goals:
1286 * 1. Find exact match for type, scope, fib_info to avoid
1287 * duplicate routes
1288 * 2. Find next 'fa' (or head), NLM_F_APPEND inserts before it
1289 */
1290 fa_match = NULL;
1291 fa_first = fa;
1292 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1293 list_for_each_entry_continue(fa, fa_head, fa_list) {
1294 if (fa->fa_tos != tos)
1295 break;
1296 if (fa->fa_info->fib_priority != fi->fib_priority)
1297 break;
1298 if (fa->fa_type == cfg->fc_type &&
1299 fa->fa_scope == cfg->fc_scope &&
1300 fa->fa_info == fi) {
1301 fa_match = fa;
1302 break;
1303 }
1304 }
1305
4e902c57 1306 if (cfg->fc_nlflags & NLM_F_REPLACE) {
19baf839
RO
1307 struct fib_info *fi_drop;
1308 u8 state;
1309
936f6f8e
JA
1310 fa = fa_first;
1311 if (fa_match) {
1312 if (fa == fa_match)
1313 err = 0;
6725033f 1314 goto out;
936f6f8e 1315 }
2373ce1c 1316 err = -ENOBUFS;
e94b1766 1317 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
2373ce1c
RO
1318 if (new_fa == NULL)
1319 goto out;
19baf839
RO
1320
1321 fi_drop = fa->fa_info;
2373ce1c
RO
1322 new_fa->fa_tos = fa->fa_tos;
1323 new_fa->fa_info = fi;
4e902c57
TG
1324 new_fa->fa_type = cfg->fc_type;
1325 new_fa->fa_scope = cfg->fc_scope;
19baf839 1326 state = fa->fa_state;
936f6f8e 1327 new_fa->fa_state = state & ~FA_S_ACCESSED;
19baf839 1328
2373ce1c
RO
1329 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1330 alias_free_mem_rcu(fa);
19baf839
RO
1331
1332 fib_release_info(fi_drop);
1333 if (state & FA_S_ACCESSED)
76e6ebfb 1334 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
b8f55831
MK
1335 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1336 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
19baf839 1337
91b9a277 1338 goto succeeded;
19baf839
RO
1339 }
1340 /* Error if we find a perfect match which
1341 * uses the same scope, type, and nexthop
1342 * information.
1343 */
936f6f8e
JA
1344 if (fa_match)
1345 goto out;
a07f5f50 1346
4e902c57 1347 if (!(cfg->fc_nlflags & NLM_F_APPEND))
936f6f8e 1348 fa = fa_first;
19baf839
RO
1349 }
1350 err = -ENOENT;
4e902c57 1351 if (!(cfg->fc_nlflags & NLM_F_CREATE))
19baf839
RO
1352 goto out;
1353
1354 err = -ENOBUFS;
e94b1766 1355 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
19baf839
RO
1356 if (new_fa == NULL)
1357 goto out;
1358
1359 new_fa->fa_info = fi;
1360 new_fa->fa_tos = tos;
4e902c57
TG
1361 new_fa->fa_type = cfg->fc_type;
1362 new_fa->fa_scope = cfg->fc_scope;
19baf839 1363 new_fa->fa_state = 0;
19baf839
RO
1364 /*
1365 * Insert new entry to the list.
1366 */
1367
c877efb2 1368 if (!fa_head) {
fea86ad8
SH
1369 fa_head = fib_insert_node(t, key, plen);
1370 if (unlikely(!fa_head)) {
1371 err = -ENOMEM;
f835e471 1372 goto out_free_new_fa;
fea86ad8 1373 }
f835e471 1374 }
19baf839 1375
2373ce1c
RO
1376 list_add_tail_rcu(&new_fa->fa_list,
1377 (fa ? &fa->fa_list : fa_head));
19baf839 1378
76e6ebfb 1379 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
4e902c57 1380 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
b8f55831 1381 &cfg->fc_nlinfo, 0);
19baf839
RO
1382succeeded:
1383 return 0;
f835e471
RO
1384
1385out_free_new_fa:
1386 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1387out:
1388 fib_release_info(fi);
91b9a277 1389err:
19baf839
RO
1390 return err;
1391}
1392
772cb712 1393/* should be called with rcu_read_lock */
a07f5f50
SH
1394static int check_leaf(struct trie *t, struct leaf *l,
1395 t_key key, const struct flowi *flp,
1396 struct fib_result *res)
19baf839 1397{
19baf839
RO
1398 struct leaf_info *li;
1399 struct hlist_head *hhead = &l->list;
1400 struct hlist_node *node;
c877efb2 1401
2373ce1c 1402 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
a07f5f50
SH
1403 int err;
1404 int plen = li->plen;
1405 __be32 mask = inet_make_mask(plen);
1406
888454c5 1407 if (l->key != (key & ntohl(mask)))
19baf839
RO
1408 continue;
1409
e204a345 1410 err = fib_semantic_match(&li->falh, flp, res, plen);
a07f5f50 1411
19baf839 1412#ifdef CONFIG_IP_FIB_TRIE_STATS
a07f5f50 1413 if (err <= 0)
19baf839 1414 t->stats.semantic_match_passed++;
a07f5f50
SH
1415 else
1416 t->stats.semantic_match_miss++;
19baf839 1417#endif
a07f5f50 1418 if (err <= 0)
2e655571 1419 return err;
19baf839 1420 }
a07f5f50 1421
2e655571 1422 return 1;
19baf839
RO
1423}
1424
a07f5f50
SH
1425static int fn_trie_lookup(struct fib_table *tb, const struct flowi *flp,
1426 struct fib_result *res)
19baf839
RO
1427{
1428 struct trie *t = (struct trie *) tb->tb_data;
2e655571 1429 int ret;
19baf839
RO
1430 struct node *n;
1431 struct tnode *pn;
1432 int pos, bits;
91b9a277 1433 t_key key = ntohl(flp->fl4_dst);
19baf839
RO
1434 int chopped_off;
1435 t_key cindex = 0;
1436 int current_prefix_length = KEYLENGTH;
91b9a277
OJ
1437 struct tnode *cn;
1438 t_key node_prefix, key_prefix, pref_mismatch;
1439 int mp;
1440
2373ce1c 1441 rcu_read_lock();
91b9a277 1442
2373ce1c 1443 n = rcu_dereference(t->trie);
c877efb2 1444 if (!n)
19baf839
RO
1445 goto failed;
1446
1447#ifdef CONFIG_IP_FIB_TRIE_STATS
1448 t->stats.gets++;
1449#endif
1450
1451 /* Just a leaf? */
1452 if (IS_LEAF(n)) {
2e655571 1453 ret = check_leaf(t, (struct leaf *)n, key, flp, res);
a07f5f50 1454 goto found;
19baf839 1455 }
a07f5f50 1456
19baf839
RO
1457 pn = (struct tnode *) n;
1458 chopped_off = 0;
c877efb2 1459
91b9a277 1460 while (pn) {
19baf839
RO
1461 pos = pn->pos;
1462 bits = pn->bits;
1463
c877efb2 1464 if (!chopped_off)
ab66b4a7
SH
1465 cindex = tkey_extract_bits(mask_pfx(key, current_prefix_length),
1466 pos, bits);
19baf839 1467
b902e573 1468 n = tnode_get_child_rcu(pn, cindex);
19baf839
RO
1469
1470 if (n == NULL) {
1471#ifdef CONFIG_IP_FIB_TRIE_STATS
1472 t->stats.null_node_hit++;
1473#endif
1474 goto backtrace;
1475 }
1476
91b9a277 1477 if (IS_LEAF(n)) {
2e655571
BH
1478 ret = check_leaf(t, (struct leaf *)n, key, flp, res);
1479 if (ret > 0)
91b9a277 1480 goto backtrace;
a07f5f50 1481 goto found;
91b9a277
OJ
1482 }
1483
91b9a277 1484 cn = (struct tnode *)n;
19baf839 1485
91b9a277
OJ
1486 /*
1487 * It's a tnode, and we can do some extra checks here if we
1488 * like, to avoid descending into a dead-end branch.
1489 * This tnode is in the parent's child array at index
1490 * key[p_pos..p_pos+p_bits] but potentially with some bits
1491 * chopped off, so in reality the index may be just a
1492 * subprefix, padded with zero at the end.
1493 * We can also take a look at any skipped bits in this
1494 * tnode - everything up to p_pos is supposed to be ok,
1495 * and the non-chopped bits of the index (se previous
1496 * paragraph) are also guaranteed ok, but the rest is
1497 * considered unknown.
1498 *
1499 * The skipped bits are key[pos+bits..cn->pos].
1500 */
19baf839 1501
91b9a277
OJ
1502 /* If current_prefix_length < pos+bits, we are already doing
1503 * actual prefix matching, which means everything from
1504 * pos+(bits-chopped_off) onward must be zero along some
1505 * branch of this subtree - otherwise there is *no* valid
1506 * prefix present. Here we can only check the skipped
1507 * bits. Remember, since we have already indexed into the
1508 * parent's child array, we know that the bits we chopped of
1509 * *are* zero.
1510 */
19baf839 1511
a07f5f50
SH
1512 /* NOTA BENE: Checking only skipped bits
1513 for the new node here */
19baf839 1514
91b9a277
OJ
1515 if (current_prefix_length < pos+bits) {
1516 if (tkey_extract_bits(cn->key, current_prefix_length,
a07f5f50
SH
1517 cn->pos - current_prefix_length)
1518 || !(cn->child[0]))
91b9a277
OJ
1519 goto backtrace;
1520 }
19baf839 1521
91b9a277
OJ
1522 /*
1523 * If chopped_off=0, the index is fully validated and we
1524 * only need to look at the skipped bits for this, the new,
1525 * tnode. What we actually want to do is to find out if
1526 * these skipped bits match our key perfectly, or if we will
1527 * have to count on finding a matching prefix further down,
1528 * because if we do, we would like to have some way of
1529 * verifying the existence of such a prefix at this point.
1530 */
19baf839 1531
91b9a277
OJ
1532 /* The only thing we can do at this point is to verify that
1533 * any such matching prefix can indeed be a prefix to our
1534 * key, and if the bits in the node we are inspecting that
1535 * do not match our key are not ZERO, this cannot be true.
1536 * Thus, find out where there is a mismatch (before cn->pos)
1537 * and verify that all the mismatching bits are zero in the
1538 * new tnode's key.
1539 */
19baf839 1540
a07f5f50
SH
1541 /*
1542 * Note: We aren't very concerned about the piece of
1543 * the key that precede pn->pos+pn->bits, since these
1544 * have already been checked. The bits after cn->pos
1545 * aren't checked since these are by definition
1546 * "unknown" at this point. Thus, what we want to see
1547 * is if we are about to enter the "prefix matching"
1548 * state, and in that case verify that the skipped
1549 * bits that will prevail throughout this subtree are
1550 * zero, as they have to be if we are to find a
1551 * matching prefix.
91b9a277
OJ
1552 */
1553
ab66b4a7
SH
1554 node_prefix = mask_pfx(cn->key, cn->pos);
1555 key_prefix = mask_pfx(key, cn->pos);
91b9a277
OJ
1556 pref_mismatch = key_prefix^node_prefix;
1557 mp = 0;
1558
a07f5f50
SH
1559 /*
1560 * In short: If skipped bits in this node do not match
1561 * the search key, enter the "prefix matching"
1562 * state.directly.
91b9a277
OJ
1563 */
1564 if (pref_mismatch) {
1565 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1566 mp++;
a07f5f50 1567 pref_mismatch = pref_mismatch << 1;
91b9a277
OJ
1568 }
1569 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1570
1571 if (key_prefix != 0)
1572 goto backtrace;
1573
1574 if (current_prefix_length >= cn->pos)
1575 current_prefix_length = mp;
c877efb2 1576 }
a07f5f50 1577
91b9a277
OJ
1578 pn = (struct tnode *)n; /* Descend */
1579 chopped_off = 0;
1580 continue;
1581
19baf839
RO
1582backtrace:
1583 chopped_off++;
1584
1585 /* As zero don't change the child key (cindex) */
a07f5f50
SH
1586 while ((chopped_off <= pn->bits)
1587 && !(cindex & (1<<(chopped_off-1))))
19baf839 1588 chopped_off++;
19baf839
RO
1589
1590 /* Decrease current_... with bits chopped off */
1591 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
a07f5f50
SH
1592 current_prefix_length = pn->pos + pn->bits
1593 - chopped_off;
91b9a277 1594
19baf839 1595 /*
c877efb2 1596 * Either we do the actual chop off according or if we have
19baf839
RO
1597 * chopped off all bits in this tnode walk up to our parent.
1598 */
1599
91b9a277 1600 if (chopped_off <= pn->bits) {
19baf839 1601 cindex &= ~(1 << (chopped_off-1));
91b9a277 1602 } else {
b902e573 1603 struct tnode *parent = node_parent_rcu((struct node *) pn);
06801916 1604 if (!parent)
19baf839 1605 goto failed;
91b9a277 1606
19baf839 1607 /* Get Child's index */
06801916
SH
1608 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1609 pn = parent;
19baf839
RO
1610 chopped_off = 0;
1611
1612#ifdef CONFIG_IP_FIB_TRIE_STATS
1613 t->stats.backtrack++;
1614#endif
1615 goto backtrace;
c877efb2 1616 }
19baf839
RO
1617 }
1618failed:
c877efb2 1619 ret = 1;
19baf839 1620found:
2373ce1c 1621 rcu_read_unlock();
19baf839
RO
1622 return ret;
1623}
1624
9195bef7
SH
1625/*
1626 * Remove the leaf and return parent.
1627 */
1628static void trie_leaf_remove(struct trie *t, struct leaf *l)
19baf839 1629{
9195bef7 1630 struct tnode *tp = node_parent((struct node *) l);
c877efb2 1631
9195bef7 1632 pr_debug("entering trie_leaf_remove(%p)\n", l);
19baf839 1633
c877efb2 1634 if (tp) {
9195bef7 1635 t_key cindex = tkey_extract_bits(l->key, tp->pos, tp->bits);
19baf839 1636 put_child(t, (struct tnode *)tp, cindex, NULL);
7b85576d 1637 trie_rebalance(t, tp);
91b9a277 1638 } else
2373ce1c 1639 rcu_assign_pointer(t->trie, NULL);
19baf839 1640
387a5487 1641 free_leaf(l);
19baf839
RO
1642}
1643
d562f1f8
RO
1644/*
1645 * Caller must hold RTNL.
1646 */
4e902c57 1647static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1648{
1649 struct trie *t = (struct trie *) tb->tb_data;
1650 u32 key, mask;
4e902c57
TG
1651 int plen = cfg->fc_dst_len;
1652 u8 tos = cfg->fc_tos;
19baf839
RO
1653 struct fib_alias *fa, *fa_to_delete;
1654 struct list_head *fa_head;
1655 struct leaf *l;
91b9a277
OJ
1656 struct leaf_info *li;
1657
c877efb2 1658 if (plen > 32)
19baf839
RO
1659 return -EINVAL;
1660
4e902c57 1661 key = ntohl(cfg->fc_dst);
91b9a277 1662 mask = ntohl(inet_make_mask(plen));
19baf839 1663
c877efb2 1664 if (key & ~mask)
19baf839
RO
1665 return -EINVAL;
1666
1667 key = key & mask;
1668 l = fib_find_node(t, key);
1669
c877efb2 1670 if (!l)
19baf839
RO
1671 return -ESRCH;
1672
1673 fa_head = get_fa_head(l, plen);
1674 fa = fib_find_alias(fa_head, tos, 0);
1675
1676 if (!fa)
1677 return -ESRCH;
1678
0c7770c7 1679 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
19baf839
RO
1680
1681 fa_to_delete = NULL;
936f6f8e
JA
1682 fa = list_entry(fa->fa_list.prev, struct fib_alias, fa_list);
1683 list_for_each_entry_continue(fa, fa_head, fa_list) {
19baf839
RO
1684 struct fib_info *fi = fa->fa_info;
1685
1686 if (fa->fa_tos != tos)
1687 break;
1688
4e902c57
TG
1689 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1690 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1691 fa->fa_scope == cfg->fc_scope) &&
1692 (!cfg->fc_protocol ||
1693 fi->fib_protocol == cfg->fc_protocol) &&
1694 fib_nh_match(cfg, fi) == 0) {
19baf839
RO
1695 fa_to_delete = fa;
1696 break;
1697 }
1698 }
1699
91b9a277
OJ
1700 if (!fa_to_delete)
1701 return -ESRCH;
19baf839 1702
91b9a277 1703 fa = fa_to_delete;
4e902c57 1704 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
b8f55831 1705 &cfg->fc_nlinfo, 0);
91b9a277
OJ
1706
1707 l = fib_find_node(t, key);
772cb712 1708 li = find_leaf_info(l, plen);
19baf839 1709
2373ce1c 1710 list_del_rcu(&fa->fa_list);
19baf839 1711
91b9a277 1712 if (list_empty(fa_head)) {
2373ce1c 1713 hlist_del_rcu(&li->hlist);
91b9a277 1714 free_leaf_info(li);
2373ce1c 1715 }
19baf839 1716
91b9a277 1717 if (hlist_empty(&l->list))
9195bef7 1718 trie_leaf_remove(t, l);
19baf839 1719
91b9a277 1720 if (fa->fa_state & FA_S_ACCESSED)
76e6ebfb 1721 rt_cache_flush(cfg->fc_nlinfo.nl_net, -1);
19baf839 1722
2373ce1c
RO
1723 fib_release_info(fa->fa_info);
1724 alias_free_mem_rcu(fa);
91b9a277 1725 return 0;
19baf839
RO
1726}
1727
ef3660ce 1728static int trie_flush_list(struct list_head *head)
19baf839
RO
1729{
1730 struct fib_alias *fa, *fa_node;
1731 int found = 0;
1732
1733 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1734 struct fib_info *fi = fa->fa_info;
19baf839 1735
2373ce1c
RO
1736 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1737 list_del_rcu(&fa->fa_list);
1738 fib_release_info(fa->fa_info);
1739 alias_free_mem_rcu(fa);
19baf839
RO
1740 found++;
1741 }
1742 }
1743 return found;
1744}
1745
ef3660ce 1746static int trie_flush_leaf(struct leaf *l)
19baf839
RO
1747{
1748 int found = 0;
1749 struct hlist_head *lih = &l->list;
1750 struct hlist_node *node, *tmp;
1751 struct leaf_info *li = NULL;
1752
1753 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
ef3660ce 1754 found += trie_flush_list(&li->falh);
19baf839
RO
1755
1756 if (list_empty(&li->falh)) {
2373ce1c 1757 hlist_del_rcu(&li->hlist);
19baf839
RO
1758 free_leaf_info(li);
1759 }
1760 }
1761 return found;
1762}
1763
82cfbb00
SH
1764/*
1765 * Scan for the next right leaf starting at node p->child[idx]
1766 * Since we have back pointer, no recursion necessary.
1767 */
1768static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c)
19baf839 1769{
82cfbb00
SH
1770 do {
1771 t_key idx;
c877efb2 1772
c877efb2 1773 if (c)
82cfbb00 1774 idx = tkey_extract_bits(c->key, p->pos, p->bits) + 1;
c877efb2 1775 else
82cfbb00 1776 idx = 0;
2373ce1c 1777
82cfbb00
SH
1778 while (idx < 1u << p->bits) {
1779 c = tnode_get_child_rcu(p, idx++);
2373ce1c 1780 if (!c)
91b9a277
OJ
1781 continue;
1782
82cfbb00
SH
1783 if (IS_LEAF(c)) {
1784 prefetch(p->child[idx]);
1785 return (struct leaf *) c;
19baf839 1786 }
82cfbb00
SH
1787
1788 /* Rescan start scanning in new node */
1789 p = (struct tnode *) c;
1790 idx = 0;
19baf839 1791 }
82cfbb00
SH
1792
1793 /* Node empty, walk back up to parent */
91b9a277 1794 c = (struct node *) p;
82cfbb00
SH
1795 } while ( (p = node_parent_rcu(c)) != NULL);
1796
1797 return NULL; /* Root of trie */
1798}
1799
82cfbb00
SH
1800static struct leaf *trie_firstleaf(struct trie *t)
1801{
1802 struct tnode *n = (struct tnode *) rcu_dereference(t->trie);
1803
1804 if (!n)
1805 return NULL;
1806
1807 if (IS_LEAF(n)) /* trie is just a leaf */
1808 return (struct leaf *) n;
1809
1810 return leaf_walk_rcu(n, NULL);
1811}
1812
1813static struct leaf *trie_nextleaf(struct leaf *l)
1814{
1815 struct node *c = (struct node *) l;
b902e573 1816 struct tnode *p = node_parent_rcu(c);
82cfbb00
SH
1817
1818 if (!p)
1819 return NULL; /* trie with just one leaf */
1820
1821 return leaf_walk_rcu(p, c);
19baf839
RO
1822}
1823
71d67e66
SH
1824static struct leaf *trie_leafindex(struct trie *t, int index)
1825{
1826 struct leaf *l = trie_firstleaf(t);
1827
ec28cf73 1828 while (l && index-- > 0)
71d67e66 1829 l = trie_nextleaf(l);
ec28cf73 1830
71d67e66
SH
1831 return l;
1832}
1833
1834
d562f1f8
RO
1835/*
1836 * Caller must hold RTNL.
1837 */
19baf839
RO
1838static int fn_trie_flush(struct fib_table *tb)
1839{
1840 struct trie *t = (struct trie *) tb->tb_data;
9195bef7 1841 struct leaf *l, *ll = NULL;
82cfbb00 1842 int found = 0;
19baf839 1843
82cfbb00 1844 for (l = trie_firstleaf(t); l; l = trie_nextleaf(l)) {
ef3660ce 1845 found += trie_flush_leaf(l);
19baf839
RO
1846
1847 if (ll && hlist_empty(&ll->list))
9195bef7 1848 trie_leaf_remove(t, ll);
19baf839
RO
1849 ll = l;
1850 }
1851
1852 if (ll && hlist_empty(&ll->list))
9195bef7 1853 trie_leaf_remove(t, ll);
19baf839 1854
0c7770c7 1855 pr_debug("trie_flush found=%d\n", found);
19baf839
RO
1856 return found;
1857}
1858
a07f5f50
SH
1859static void fn_trie_select_default(struct fib_table *tb,
1860 const struct flowi *flp,
1861 struct fib_result *res)
19baf839
RO
1862{
1863 struct trie *t = (struct trie *) tb->tb_data;
1864 int order, last_idx;
1865 struct fib_info *fi = NULL;
1866 struct fib_info *last_resort;
1867 struct fib_alias *fa = NULL;
1868 struct list_head *fa_head;
1869 struct leaf *l;
1870
1871 last_idx = -1;
1872 last_resort = NULL;
1873 order = -1;
1874
2373ce1c 1875 rcu_read_lock();
c877efb2 1876
19baf839 1877 l = fib_find_node(t, 0);
c877efb2 1878 if (!l)
19baf839
RO
1879 goto out;
1880
1881 fa_head = get_fa_head(l, 0);
c877efb2 1882 if (!fa_head)
19baf839
RO
1883 goto out;
1884
c877efb2 1885 if (list_empty(fa_head))
19baf839
RO
1886 goto out;
1887
2373ce1c 1888 list_for_each_entry_rcu(fa, fa_head, fa_list) {
19baf839 1889 struct fib_info *next_fi = fa->fa_info;
91b9a277 1890
19baf839
RO
1891 if (fa->fa_scope != res->scope ||
1892 fa->fa_type != RTN_UNICAST)
1893 continue;
91b9a277 1894
19baf839
RO
1895 if (next_fi->fib_priority > res->fi->fib_priority)
1896 break;
1897 if (!next_fi->fib_nh[0].nh_gw ||
1898 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1899 continue;
1900 fa->fa_state |= FA_S_ACCESSED;
91b9a277 1901
19baf839
RO
1902 if (fi == NULL) {
1903 if (next_fi != res->fi)
1904 break;
1905 } else if (!fib_detect_death(fi, order, &last_resort,
971b893e 1906 &last_idx, tb->tb_default)) {
a2bbe682 1907 fib_result_assign(res, fi);
971b893e 1908 tb->tb_default = order;
19baf839
RO
1909 goto out;
1910 }
1911 fi = next_fi;
1912 order++;
1913 }
1914 if (order <= 0 || fi == NULL) {
971b893e 1915 tb->tb_default = -1;
19baf839
RO
1916 goto out;
1917 }
1918
971b893e
DL
1919 if (!fib_detect_death(fi, order, &last_resort, &last_idx,
1920 tb->tb_default)) {
a2bbe682 1921 fib_result_assign(res, fi);
971b893e 1922 tb->tb_default = order;
19baf839
RO
1923 goto out;
1924 }
a2bbe682
DL
1925 if (last_idx >= 0)
1926 fib_result_assign(res, last_resort);
971b893e
DL
1927 tb->tb_default = last_idx;
1928out:
2373ce1c 1929 rcu_read_unlock();
19baf839
RO
1930}
1931
a07f5f50
SH
1932static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah,
1933 struct fib_table *tb,
19baf839
RO
1934 struct sk_buff *skb, struct netlink_callback *cb)
1935{
1936 int i, s_i;
1937 struct fib_alias *fa;
32ab5f80 1938 __be32 xkey = htonl(key);
19baf839 1939
71d67e66 1940 s_i = cb->args[5];
19baf839
RO
1941 i = 0;
1942
2373ce1c
RO
1943 /* rcu_read_lock is hold by caller */
1944
1945 list_for_each_entry_rcu(fa, fah, fa_list) {
19baf839
RO
1946 if (i < s_i) {
1947 i++;
1948 continue;
1949 }
19baf839
RO
1950
1951 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1952 cb->nlh->nlmsg_seq,
1953 RTM_NEWROUTE,
1954 tb->tb_id,
1955 fa->fa_type,
1956 fa->fa_scope,
be403ea1 1957 xkey,
19baf839
RO
1958 plen,
1959 fa->fa_tos,
64347f78 1960 fa->fa_info, NLM_F_MULTI) < 0) {
71d67e66 1961 cb->args[5] = i;
19baf839 1962 return -1;
91b9a277 1963 }
19baf839
RO
1964 i++;
1965 }
71d67e66 1966 cb->args[5] = i;
19baf839
RO
1967 return skb->len;
1968}
1969
a88ee229
SH
1970static int fn_trie_dump_leaf(struct leaf *l, struct fib_table *tb,
1971 struct sk_buff *skb, struct netlink_callback *cb)
19baf839 1972{
a88ee229
SH
1973 struct leaf_info *li;
1974 struct hlist_node *node;
1975 int i, s_i;
19baf839 1976
71d67e66 1977 s_i = cb->args[4];
a88ee229 1978 i = 0;
19baf839 1979
a88ee229
SH
1980 /* rcu_read_lock is hold by caller */
1981 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
1982 if (i < s_i) {
1983 i++;
19baf839 1984 continue;
a88ee229 1985 }
91b9a277 1986
a88ee229 1987 if (i > s_i)
71d67e66 1988 cb->args[5] = 0;
19baf839 1989
a88ee229 1990 if (list_empty(&li->falh))
19baf839
RO
1991 continue;
1992
a88ee229 1993 if (fn_trie_dump_fa(l->key, li->plen, &li->falh, tb, skb, cb) < 0) {
71d67e66 1994 cb->args[4] = i;
19baf839
RO
1995 return -1;
1996 }
a88ee229 1997 i++;
19baf839 1998 }
a88ee229 1999
71d67e66 2000 cb->args[4] = i;
19baf839
RO
2001 return skb->len;
2002}
2003
a07f5f50
SH
2004static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb,
2005 struct netlink_callback *cb)
19baf839 2006{
a88ee229 2007 struct leaf *l;
19baf839 2008 struct trie *t = (struct trie *) tb->tb_data;
d5ce8a0e 2009 t_key key = cb->args[2];
71d67e66 2010 int count = cb->args[3];
19baf839 2011
2373ce1c 2012 rcu_read_lock();
d5ce8a0e
SH
2013 /* Dump starting at last key.
2014 * Note: 0.0.0.0/0 (ie default) is first key.
2015 */
71d67e66 2016 if (count == 0)
d5ce8a0e
SH
2017 l = trie_firstleaf(t);
2018 else {
71d67e66
SH
2019 /* Normally, continue from last key, but if that is missing
2020 * fallback to using slow rescan
2021 */
d5ce8a0e 2022 l = fib_find_node(t, key);
71d67e66
SH
2023 if (!l)
2024 l = trie_leafindex(t, count);
d5ce8a0e 2025 }
a88ee229 2026
d5ce8a0e
SH
2027 while (l) {
2028 cb->args[2] = l->key;
a88ee229 2029 if (fn_trie_dump_leaf(l, tb, skb, cb) < 0) {
71d67e66 2030 cb->args[3] = count;
a88ee229 2031 rcu_read_unlock();
a88ee229 2032 return -1;
19baf839 2033 }
d5ce8a0e 2034
71d67e66 2035 ++count;
d5ce8a0e 2036 l = trie_nextleaf(l);
71d67e66
SH
2037 memset(&cb->args[4], 0,
2038 sizeof(cb->args) - 4*sizeof(cb->args[0]));
19baf839 2039 }
71d67e66 2040 cb->args[3] = count;
2373ce1c 2041 rcu_read_unlock();
a88ee229 2042
19baf839 2043 return skb->len;
19baf839
RO
2044}
2045
7f9b8052
SH
2046void __init fib_hash_init(void)
2047{
a07f5f50
SH
2048 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
2049 sizeof(struct fib_alias),
bc3c8c1e
SH
2050 0, SLAB_PANIC, NULL);
2051
2052 trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
2053 max(sizeof(struct leaf),
2054 sizeof(struct leaf_info)),
2055 0, SLAB_PANIC, NULL);
7f9b8052 2056}
19baf839 2057
7f9b8052
SH
2058
2059/* Fix more generic FIB names for init later */
2060struct fib_table *fib_hash_table(u32 id)
19baf839
RO
2061{
2062 struct fib_table *tb;
2063 struct trie *t;
2064
19baf839
RO
2065 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
2066 GFP_KERNEL);
2067 if (tb == NULL)
2068 return NULL;
2069
2070 tb->tb_id = id;
971b893e 2071 tb->tb_default = -1;
19baf839
RO
2072 tb->tb_lookup = fn_trie_lookup;
2073 tb->tb_insert = fn_trie_insert;
2074 tb->tb_delete = fn_trie_delete;
2075 tb->tb_flush = fn_trie_flush;
2076 tb->tb_select_default = fn_trie_select_default;
2077 tb->tb_dump = fn_trie_dump;
19baf839
RO
2078
2079 t = (struct trie *) tb->tb_data;
c28a1cf4 2080 memset(t, 0, sizeof(*t));
19baf839 2081
19baf839 2082 if (id == RT_TABLE_LOCAL)
a07f5f50 2083 pr_info("IPv4 FIB: Using LC-trie version %s\n", VERSION);
19baf839
RO
2084
2085 return tb;
2086}
2087
cb7b593c
SH
2088#ifdef CONFIG_PROC_FS
2089/* Depth first Trie walk iterator */
2090struct fib_trie_iter {
1c340b2f 2091 struct seq_net_private p;
3d3b2d25 2092 struct fib_table *tb;
cb7b593c 2093 struct tnode *tnode;
cb7b593c
SH
2094 unsigned index;
2095 unsigned depth;
2096};
19baf839 2097
cb7b593c 2098static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
19baf839 2099{
cb7b593c
SH
2100 struct tnode *tn = iter->tnode;
2101 unsigned cindex = iter->index;
2102 struct tnode *p;
19baf839 2103
6640e697
EB
2104 /* A single entry routing table */
2105 if (!tn)
2106 return NULL;
2107
cb7b593c
SH
2108 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2109 iter->tnode, iter->index, iter->depth);
2110rescan:
2111 while (cindex < (1<<tn->bits)) {
b59cfbf7 2112 struct node *n = tnode_get_child_rcu(tn, cindex);
19baf839 2113
cb7b593c
SH
2114 if (n) {
2115 if (IS_LEAF(n)) {
2116 iter->tnode = tn;
2117 iter->index = cindex + 1;
2118 } else {
2119 /* push down one level */
2120 iter->tnode = (struct tnode *) n;
2121 iter->index = 0;
2122 ++iter->depth;
2123 }
2124 return n;
2125 }
19baf839 2126
cb7b593c
SH
2127 ++cindex;
2128 }
91b9a277 2129
cb7b593c 2130 /* Current node exhausted, pop back up */
b59cfbf7 2131 p = node_parent_rcu((struct node *)tn);
cb7b593c
SH
2132 if (p) {
2133 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2134 tn = p;
2135 --iter->depth;
2136 goto rescan;
19baf839 2137 }
cb7b593c
SH
2138
2139 /* got root? */
2140 return NULL;
19baf839
RO
2141}
2142
cb7b593c
SH
2143static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2144 struct trie *t)
19baf839 2145{
3d3b2d25 2146 struct node *n;
5ddf0eb2 2147
132adf54 2148 if (!t)
5ddf0eb2
RO
2149 return NULL;
2150
2151 n = rcu_dereference(t->trie);
3d3b2d25 2152 if (!n)
5ddf0eb2 2153 return NULL;
19baf839 2154
3d3b2d25
SH
2155 if (IS_TNODE(n)) {
2156 iter->tnode = (struct tnode *) n;
2157 iter->index = 0;
2158 iter->depth = 1;
2159 } else {
2160 iter->tnode = NULL;
2161 iter->index = 0;
2162 iter->depth = 0;
91b9a277 2163 }
3d3b2d25
SH
2164
2165 return n;
cb7b593c 2166}
91b9a277 2167
cb7b593c
SH
2168static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2169{
2170 struct node *n;
2171 struct fib_trie_iter iter;
91b9a277 2172
cb7b593c 2173 memset(s, 0, sizeof(*s));
91b9a277 2174
cb7b593c 2175 rcu_read_lock();
3d3b2d25 2176 for (n = fib_trie_get_first(&iter, t); n; n = fib_trie_get_next(&iter)) {
cb7b593c 2177 if (IS_LEAF(n)) {
93672292
SH
2178 struct leaf *l = (struct leaf *)n;
2179 struct leaf_info *li;
2180 struct hlist_node *tmp;
2181
cb7b593c
SH
2182 s->leaves++;
2183 s->totdepth += iter.depth;
2184 if (iter.depth > s->maxdepth)
2185 s->maxdepth = iter.depth;
93672292
SH
2186
2187 hlist_for_each_entry_rcu(li, tmp, &l->list, hlist)
2188 ++s->prefixes;
cb7b593c
SH
2189 } else {
2190 const struct tnode *tn = (const struct tnode *) n;
2191 int i;
2192
2193 s->tnodes++;
132adf54 2194 if (tn->bits < MAX_STAT_DEPTH)
06ef921d
RO
2195 s->nodesizes[tn->bits]++;
2196
cb7b593c
SH
2197 for (i = 0; i < (1<<tn->bits); i++)
2198 if (!tn->child[i])
2199 s->nullpointers++;
19baf839 2200 }
19baf839 2201 }
2373ce1c 2202 rcu_read_unlock();
19baf839
RO
2203}
2204
cb7b593c
SH
2205/*
2206 * This outputs /proc/net/fib_triestats
2207 */
2208static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
19baf839 2209{
cb7b593c 2210 unsigned i, max, pointers, bytes, avdepth;
c877efb2 2211
cb7b593c
SH
2212 if (stat->leaves)
2213 avdepth = stat->totdepth*100 / stat->leaves;
2214 else
2215 avdepth = 0;
91b9a277 2216
a07f5f50
SH
2217 seq_printf(seq, "\tAver depth: %u.%02d\n",
2218 avdepth / 100, avdepth % 100);
cb7b593c 2219 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
91b9a277 2220
cb7b593c 2221 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
cb7b593c 2222 bytes = sizeof(struct leaf) * stat->leaves;
93672292
SH
2223
2224 seq_printf(seq, "\tPrefixes: %u\n", stat->prefixes);
2225 bytes += sizeof(struct leaf_info) * stat->prefixes;
2226
187b5188 2227 seq_printf(seq, "\tInternal nodes: %u\n\t", stat->tnodes);
cb7b593c 2228 bytes += sizeof(struct tnode) * stat->tnodes;
19baf839 2229
06ef921d
RO
2230 max = MAX_STAT_DEPTH;
2231 while (max > 0 && stat->nodesizes[max-1] == 0)
cb7b593c 2232 max--;
19baf839 2233
cb7b593c
SH
2234 pointers = 0;
2235 for (i = 1; i <= max; i++)
2236 if (stat->nodesizes[i] != 0) {
187b5188 2237 seq_printf(seq, " %u: %u", i, stat->nodesizes[i]);
cb7b593c
SH
2238 pointers += (1<<i) * stat->nodesizes[i];
2239 }
2240 seq_putc(seq, '\n');
187b5188 2241 seq_printf(seq, "\tPointers: %u\n", pointers);
2373ce1c 2242
cb7b593c 2243 bytes += sizeof(struct node *) * pointers;
187b5188
SH
2244 seq_printf(seq, "Null ptrs: %u\n", stat->nullpointers);
2245 seq_printf(seq, "Total size: %u kB\n", (bytes + 1023) / 1024);
66a2f7fd 2246}
2373ce1c 2247
cb7b593c 2248#ifdef CONFIG_IP_FIB_TRIE_STATS
66a2f7fd
SH
2249static void trie_show_usage(struct seq_file *seq,
2250 const struct trie_use_stats *stats)
2251{
2252 seq_printf(seq, "\nCounters:\n---------\n");
a07f5f50
SH
2253 seq_printf(seq, "gets = %u\n", stats->gets);
2254 seq_printf(seq, "backtracks = %u\n", stats->backtrack);
2255 seq_printf(seq, "semantic match passed = %u\n",
2256 stats->semantic_match_passed);
2257 seq_printf(seq, "semantic match miss = %u\n",
2258 stats->semantic_match_miss);
2259 seq_printf(seq, "null node hit= %u\n", stats->null_node_hit);
2260 seq_printf(seq, "skipped node resize = %u\n\n",
2261 stats->resize_node_skipped);
cb7b593c 2262}
66a2f7fd
SH
2263#endif /* CONFIG_IP_FIB_TRIE_STATS */
2264
3d3b2d25 2265static void fib_table_print(struct seq_file *seq, struct fib_table *tb)
d717a9a6 2266{
3d3b2d25
SH
2267 if (tb->tb_id == RT_TABLE_LOCAL)
2268 seq_puts(seq, "Local:\n");
2269 else if (tb->tb_id == RT_TABLE_MAIN)
2270 seq_puts(seq, "Main:\n");
2271 else
2272 seq_printf(seq, "Id %d:\n", tb->tb_id);
d717a9a6 2273}
19baf839 2274
3d3b2d25 2275
cb7b593c
SH
2276static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2277{
1c340b2f 2278 struct net *net = (struct net *)seq->private;
3d3b2d25 2279 unsigned int h;
877a9bff 2280
d717a9a6 2281 seq_printf(seq,
a07f5f50
SH
2282 "Basic info: size of leaf:"
2283 " %Zd bytes, size of tnode: %Zd bytes.\n",
d717a9a6
SH
2284 sizeof(struct leaf), sizeof(struct tnode));
2285
3d3b2d25
SH
2286 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2287 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2288 struct hlist_node *node;
2289 struct fib_table *tb;
2290
2291 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2292 struct trie *t = (struct trie *) tb->tb_data;
2293 struct trie_stat stat;
877a9bff 2294
3d3b2d25
SH
2295 if (!t)
2296 continue;
2297
2298 fib_table_print(seq, tb);
2299
2300 trie_collect_stats(t, &stat);
2301 trie_show_stats(seq, &stat);
2302#ifdef CONFIG_IP_FIB_TRIE_STATS
2303 trie_show_usage(seq, &t->stats);
2304#endif
2305 }
2306 }
19baf839 2307
cb7b593c 2308 return 0;
19baf839
RO
2309}
2310
cb7b593c 2311static int fib_triestat_seq_open(struct inode *inode, struct file *file)
19baf839 2312{
de05c557 2313 return single_open_net(inode, file, fib_triestat_seq_show);
1c340b2f
DL
2314}
2315
9a32144e 2316static const struct file_operations fib_triestat_fops = {
cb7b593c
SH
2317 .owner = THIS_MODULE,
2318 .open = fib_triestat_seq_open,
2319 .read = seq_read,
2320 .llseek = seq_lseek,
b6fcbdb4 2321 .release = single_release_net,
cb7b593c
SH
2322};
2323
1218854a 2324static struct node *fib_trie_get_idx(struct seq_file *seq, loff_t pos)
19baf839 2325{
1218854a
YH
2326 struct fib_trie_iter *iter = seq->private;
2327 struct net *net = seq_file_net(seq);
cb7b593c 2328 loff_t idx = 0;
3d3b2d25 2329 unsigned int h;
cb7b593c 2330
3d3b2d25
SH
2331 for (h = 0; h < FIB_TABLE_HASHSZ; h++) {
2332 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2333 struct hlist_node *node;
2334 struct fib_table *tb;
cb7b593c 2335
3d3b2d25
SH
2336 hlist_for_each_entry_rcu(tb, node, head, tb_hlist) {
2337 struct node *n;
2338
2339 for (n = fib_trie_get_first(iter,
2340 (struct trie *) tb->tb_data);
2341 n; n = fib_trie_get_next(iter))
2342 if (pos == idx++) {
2343 iter->tb = tb;
2344 return n;
2345 }
2346 }
cb7b593c 2347 }
3d3b2d25 2348
19baf839
RO
2349 return NULL;
2350}
2351
cb7b593c 2352static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
c95aaf9a 2353 __acquires(RCU)
19baf839 2354{
cb7b593c 2355 rcu_read_lock();
1218854a 2356 return fib_trie_get_idx(seq, *pos);
19baf839
RO
2357}
2358
cb7b593c 2359static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
19baf839 2360{
cb7b593c 2361 struct fib_trie_iter *iter = seq->private;
1218854a 2362 struct net *net = seq_file_net(seq);
3d3b2d25
SH
2363 struct fib_table *tb = iter->tb;
2364 struct hlist_node *tb_node;
2365 unsigned int h;
2366 struct node *n;
cb7b593c 2367
19baf839 2368 ++*pos;
3d3b2d25
SH
2369 /* next node in same table */
2370 n = fib_trie_get_next(iter);
2371 if (n)
2372 return n;
19baf839 2373
3d3b2d25
SH
2374 /* walk rest of this hash chain */
2375 h = tb->tb_id & (FIB_TABLE_HASHSZ - 1);
2376 while ( (tb_node = rcu_dereference(tb->tb_hlist.next)) ) {
2377 tb = hlist_entry(tb_node, struct fib_table, tb_hlist);
2378 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2379 if (n)
2380 goto found;
2381 }
19baf839 2382
3d3b2d25
SH
2383 /* new hash chain */
2384 while (++h < FIB_TABLE_HASHSZ) {
2385 struct hlist_head *head = &net->ipv4.fib_table_hash[h];
2386 hlist_for_each_entry_rcu(tb, tb_node, head, tb_hlist) {
2387 n = fib_trie_get_first(iter, (struct trie *) tb->tb_data);
2388 if (n)
2389 goto found;
2390 }
2391 }
cb7b593c 2392 return NULL;
3d3b2d25
SH
2393
2394found:
2395 iter->tb = tb;
2396 return n;
cb7b593c 2397}
19baf839 2398
cb7b593c 2399static void fib_trie_seq_stop(struct seq_file *seq, void *v)
c95aaf9a 2400 __releases(RCU)
19baf839 2401{
cb7b593c
SH
2402 rcu_read_unlock();
2403}
91b9a277 2404
cb7b593c
SH
2405static void seq_indent(struct seq_file *seq, int n)
2406{
2407 while (n-- > 0) seq_puts(seq, " ");
2408}
19baf839 2409
28d36e37 2410static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s)
cb7b593c 2411{
132adf54 2412 switch (s) {
cb7b593c
SH
2413 case RT_SCOPE_UNIVERSE: return "universe";
2414 case RT_SCOPE_SITE: return "site";
2415 case RT_SCOPE_LINK: return "link";
2416 case RT_SCOPE_HOST: return "host";
2417 case RT_SCOPE_NOWHERE: return "nowhere";
2418 default:
28d36e37 2419 snprintf(buf, len, "scope=%d", s);
cb7b593c
SH
2420 return buf;
2421 }
2422}
19baf839 2423
36cbd3dc 2424static const char *const rtn_type_names[__RTN_MAX] = {
cb7b593c
SH
2425 [RTN_UNSPEC] = "UNSPEC",
2426 [RTN_UNICAST] = "UNICAST",
2427 [RTN_LOCAL] = "LOCAL",
2428 [RTN_BROADCAST] = "BROADCAST",
2429 [RTN_ANYCAST] = "ANYCAST",
2430 [RTN_MULTICAST] = "MULTICAST",
2431 [RTN_BLACKHOLE] = "BLACKHOLE",
2432 [RTN_UNREACHABLE] = "UNREACHABLE",
2433 [RTN_PROHIBIT] = "PROHIBIT",
2434 [RTN_THROW] = "THROW",
2435 [RTN_NAT] = "NAT",
2436 [RTN_XRESOLVE] = "XRESOLVE",
2437};
19baf839 2438
28d36e37 2439static inline const char *rtn_type(char *buf, size_t len, unsigned t)
cb7b593c 2440{
cb7b593c
SH
2441 if (t < __RTN_MAX && rtn_type_names[t])
2442 return rtn_type_names[t];
28d36e37 2443 snprintf(buf, len, "type %u", t);
cb7b593c 2444 return buf;
19baf839
RO
2445}
2446
cb7b593c
SH
2447/* Pretty print the trie */
2448static int fib_trie_seq_show(struct seq_file *seq, void *v)
19baf839 2449{
cb7b593c
SH
2450 const struct fib_trie_iter *iter = seq->private;
2451 struct node *n = v;
c877efb2 2452
3d3b2d25
SH
2453 if (!node_parent_rcu(n))
2454 fib_table_print(seq, iter->tb);
095b8501 2455
cb7b593c
SH
2456 if (IS_TNODE(n)) {
2457 struct tnode *tn = (struct tnode *) n;
ab66b4a7 2458 __be32 prf = htonl(mask_pfx(tn->key, tn->pos));
91b9a277 2459
1d25cd6c 2460 seq_indent(seq, iter->depth-1);
673d57e7
HH
2461 seq_printf(seq, " +-- %pI4/%d %d %d %d\n",
2462 &prf, tn->pos, tn->bits, tn->full_children,
1d25cd6c 2463 tn->empty_children);
e905a9ed 2464
cb7b593c
SH
2465 } else {
2466 struct leaf *l = (struct leaf *) n;
1328042e
SH
2467 struct leaf_info *li;
2468 struct hlist_node *node;
32ab5f80 2469 __be32 val = htonl(l->key);
cb7b593c
SH
2470
2471 seq_indent(seq, iter->depth);
673d57e7 2472 seq_printf(seq, " |-- %pI4\n", &val);
1328042e
SH
2473
2474 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
2475 struct fib_alias *fa;
2476
2477 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2478 char buf1[32], buf2[32];
2479
2480 seq_indent(seq, iter->depth+1);
2481 seq_printf(seq, " /%d %s %s", li->plen,
2482 rtn_scope(buf1, sizeof(buf1),
2483 fa->fa_scope),
2484 rtn_type(buf2, sizeof(buf2),
2485 fa->fa_type));
2486 if (fa->fa_tos)
b9c4d82a 2487 seq_printf(seq, " tos=%d", fa->fa_tos);
1328042e 2488 seq_putc(seq, '\n');
cb7b593c
SH
2489 }
2490 }
19baf839 2491 }
cb7b593c 2492
19baf839
RO
2493 return 0;
2494}
2495
f690808e 2496static const struct seq_operations fib_trie_seq_ops = {
cb7b593c
SH
2497 .start = fib_trie_seq_start,
2498 .next = fib_trie_seq_next,
2499 .stop = fib_trie_seq_stop,
2500 .show = fib_trie_seq_show,
19baf839
RO
2501};
2502
cb7b593c 2503static int fib_trie_seq_open(struct inode *inode, struct file *file)
19baf839 2504{
1c340b2f
DL
2505 return seq_open_net(inode, file, &fib_trie_seq_ops,
2506 sizeof(struct fib_trie_iter));
19baf839
RO
2507}
2508
9a32144e 2509static const struct file_operations fib_trie_fops = {
cb7b593c
SH
2510 .owner = THIS_MODULE,
2511 .open = fib_trie_seq_open,
2512 .read = seq_read,
2513 .llseek = seq_lseek,
1c340b2f 2514 .release = seq_release_net,
19baf839
RO
2515};
2516
8315f5d8
SH
2517struct fib_route_iter {
2518 struct seq_net_private p;
2519 struct trie *main_trie;
2520 loff_t pos;
2521 t_key key;
2522};
2523
2524static struct leaf *fib_route_get_idx(struct fib_route_iter *iter, loff_t pos)
2525{
2526 struct leaf *l = NULL;
2527 struct trie *t = iter->main_trie;
2528
2529 /* use cache location of last found key */
2530 if (iter->pos > 0 && pos >= iter->pos && (l = fib_find_node(t, iter->key)))
2531 pos -= iter->pos;
2532 else {
2533 iter->pos = 0;
2534 l = trie_firstleaf(t);
2535 }
2536
2537 while (l && pos-- > 0) {
2538 iter->pos++;
2539 l = trie_nextleaf(l);
2540 }
2541
2542 if (l)
2543 iter->key = pos; /* remember it */
2544 else
2545 iter->pos = 0; /* forget it */
2546
2547 return l;
2548}
2549
2550static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos)
2551 __acquires(RCU)
2552{
2553 struct fib_route_iter *iter = seq->private;
2554 struct fib_table *tb;
2555
2556 rcu_read_lock();
1218854a 2557 tb = fib_get_table(seq_file_net(seq), RT_TABLE_MAIN);
8315f5d8
SH
2558 if (!tb)
2559 return NULL;
2560
2561 iter->main_trie = (struct trie *) tb->tb_data;
2562 if (*pos == 0)
2563 return SEQ_START_TOKEN;
2564 else
2565 return fib_route_get_idx(iter, *pos - 1);
2566}
2567
2568static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2569{
2570 struct fib_route_iter *iter = seq->private;
2571 struct leaf *l = v;
2572
2573 ++*pos;
2574 if (v == SEQ_START_TOKEN) {
2575 iter->pos = 0;
2576 l = trie_firstleaf(iter->main_trie);
2577 } else {
2578 iter->pos++;
2579 l = trie_nextleaf(l);
2580 }
2581
2582 if (l)
2583 iter->key = l->key;
2584 else
2585 iter->pos = 0;
2586 return l;
2587}
2588
2589static void fib_route_seq_stop(struct seq_file *seq, void *v)
2590 __releases(RCU)
2591{
2592 rcu_read_unlock();
2593}
2594
32ab5f80 2595static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
19baf839 2596{
cb7b593c
SH
2597 static unsigned type2flags[RTN_MAX + 1] = {
2598 [7] = RTF_REJECT, [8] = RTF_REJECT,
2599 };
2600 unsigned flags = type2flags[type];
19baf839 2601
cb7b593c
SH
2602 if (fi && fi->fib_nh->nh_gw)
2603 flags |= RTF_GATEWAY;
32ab5f80 2604 if (mask == htonl(0xFFFFFFFF))
cb7b593c
SH
2605 flags |= RTF_HOST;
2606 flags |= RTF_UP;
2607 return flags;
19baf839
RO
2608}
2609
cb7b593c
SH
2610/*
2611 * This outputs /proc/net/route.
2612 * The format of the file is not supposed to be changed
2613 * and needs to be same as fib_hash output to avoid breaking
2614 * legacy utilities
2615 */
2616static int fib_route_seq_show(struct seq_file *seq, void *v)
19baf839 2617{
cb7b593c 2618 struct leaf *l = v;
1328042e
SH
2619 struct leaf_info *li;
2620 struct hlist_node *node;
19baf839 2621
cb7b593c
SH
2622 if (v == SEQ_START_TOKEN) {
2623 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2624 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2625 "\tWindow\tIRTT");
2626 return 0;
2627 }
19baf839 2628
1328042e 2629 hlist_for_each_entry_rcu(li, node, &l->list, hlist) {
cb7b593c 2630 struct fib_alias *fa;
32ab5f80 2631 __be32 mask, prefix;
91b9a277 2632
cb7b593c
SH
2633 mask = inet_make_mask(li->plen);
2634 prefix = htonl(l->key);
19baf839 2635
cb7b593c 2636 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1371e37d 2637 const struct fib_info *fi = fa->fa_info;
cb7b593c 2638 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
5e659e4c 2639 int len;
19baf839 2640
cb7b593c
SH
2641 if (fa->fa_type == RTN_BROADCAST
2642 || fa->fa_type == RTN_MULTICAST)
2643 continue;
19baf839 2644
cb7b593c 2645 if (fi)
5e659e4c
PE
2646 seq_printf(seq,
2647 "%s\t%08X\t%08X\t%04X\t%d\t%u\t"
2648 "%d\t%08X\t%d\t%u\t%u%n",
cb7b593c
SH
2649 fi->fib_dev ? fi->fib_dev->name : "*",
2650 prefix,
2651 fi->fib_nh->nh_gw, flags, 0, 0,
2652 fi->fib_priority,
2653 mask,
a07f5f50
SH
2654 (fi->fib_advmss ?
2655 fi->fib_advmss + 40 : 0),
cb7b593c 2656 fi->fib_window,
5e659e4c 2657 fi->fib_rtt >> 3, &len);
cb7b593c 2658 else
5e659e4c
PE
2659 seq_printf(seq,
2660 "*\t%08X\t%08X\t%04X\t%d\t%u\t"
2661 "%d\t%08X\t%d\t%u\t%u%n",
cb7b593c 2662 prefix, 0, flags, 0, 0, 0,
5e659e4c 2663 mask, 0, 0, 0, &len);
19baf839 2664
5e659e4c 2665 seq_printf(seq, "%*s\n", 127 - len, "");
cb7b593c 2666 }
19baf839
RO
2667 }
2668
2669 return 0;
2670}
2671
f690808e 2672static const struct seq_operations fib_route_seq_ops = {
8315f5d8
SH
2673 .start = fib_route_seq_start,
2674 .next = fib_route_seq_next,
2675 .stop = fib_route_seq_stop,
cb7b593c 2676 .show = fib_route_seq_show,
19baf839
RO
2677};
2678
cb7b593c 2679static int fib_route_seq_open(struct inode *inode, struct file *file)
19baf839 2680{
1c340b2f 2681 return seq_open_net(inode, file, &fib_route_seq_ops,
8315f5d8 2682 sizeof(struct fib_route_iter));
19baf839
RO
2683}
2684
9a32144e 2685static const struct file_operations fib_route_fops = {
cb7b593c
SH
2686 .owner = THIS_MODULE,
2687 .open = fib_route_seq_open,
2688 .read = seq_read,
2689 .llseek = seq_lseek,
1c340b2f 2690 .release = seq_release_net,
19baf839
RO
2691};
2692
61a02653 2693int __net_init fib_proc_init(struct net *net)
19baf839 2694{
61a02653 2695 if (!proc_net_fops_create(net, "fib_trie", S_IRUGO, &fib_trie_fops))
cb7b593c
SH
2696 goto out1;
2697
61a02653
DL
2698 if (!proc_net_fops_create(net, "fib_triestat", S_IRUGO,
2699 &fib_triestat_fops))
cb7b593c
SH
2700 goto out2;
2701
61a02653 2702 if (!proc_net_fops_create(net, "route", S_IRUGO, &fib_route_fops))
cb7b593c
SH
2703 goto out3;
2704
19baf839 2705 return 0;
cb7b593c
SH
2706
2707out3:
61a02653 2708 proc_net_remove(net, "fib_triestat");
cb7b593c 2709out2:
61a02653 2710 proc_net_remove(net, "fib_trie");
cb7b593c
SH
2711out1:
2712 return -ENOMEM;
19baf839
RO
2713}
2714
61a02653 2715void __net_exit fib_proc_exit(struct net *net)
19baf839 2716{
61a02653
DL
2717 proc_net_remove(net, "fib_trie");
2718 proc_net_remove(net, "fib_triestat");
2719 proc_net_remove(net, "route");
19baf839
RO
2720}
2721
2722#endif /* CONFIG_PROC_FS */