]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/ipv4/fib_trie.c
[IPV4] fib_trie: cleanup
[net-next-2.6.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
e905a9ed 10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
19baf839 11 * Agricultural Sciences.
e905a9ed 12 *
19baf839
RO
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
e905a9ed 16 *
19baf839
RO
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
34 *
35 * IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
fd966255
RO
44 *
45 * Substantial contributions to this work comes from:
46 *
47 * David S. Miller, <davem@davemloft.net>
48 * Stephen Hemminger <shemminger@osdl.org>
49 * Paul E. McKenney <paulmck@us.ibm.com>
50 * Patrick McHardy <kaber@trash.net>
19baf839
RO
51 */
52
05eee48c 53#define VERSION "0.408"
19baf839 54
19baf839
RO
55#include <asm/uaccess.h>
56#include <asm/system.h>
57#include <asm/bitops.h>
58#include <linux/types.h>
59#include <linux/kernel.h>
19baf839
RO
60#include <linux/mm.h>
61#include <linux/string.h>
62#include <linux/socket.h>
63#include <linux/sockios.h>
64#include <linux/errno.h>
65#include <linux/in.h>
66#include <linux/inet.h>
cd8787ab 67#include <linux/inetdevice.h>
19baf839
RO
68#include <linux/netdevice.h>
69#include <linux/if_arp.h>
70#include <linux/proc_fs.h>
2373ce1c 71#include <linux/rcupdate.h>
19baf839
RO
72#include <linux/skbuff.h>
73#include <linux/netlink.h>
74#include <linux/init.h>
75#include <linux/list.h>
76#include <net/ip.h>
77#include <net/protocol.h>
78#include <net/route.h>
79#include <net/tcp.h>
80#include <net/sock.h>
81#include <net/ip_fib.h>
82#include "fib_lookup.h"
83
84#undef CONFIG_IP_FIB_TRIE_STATS
06ef921d 85#define MAX_STAT_DEPTH 32
19baf839 86
19baf839
RO
87#define KEYLENGTH (8*sizeof(t_key))
88#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
89#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
90
19baf839
RO
91typedef unsigned int t_key;
92
93#define T_TNODE 0
94#define T_LEAF 1
95#define NODE_TYPE_MASK 0x1UL
2373ce1c
RO
96#define NODE_TYPE(node) ((node)->parent & NODE_TYPE_MASK)
97
91b9a277
OJ
98#define IS_TNODE(n) (!(n->parent & T_LEAF))
99#define IS_LEAF(n) (n->parent & T_LEAF)
19baf839
RO
100
101struct node {
91b9a277
OJ
102 t_key key;
103 unsigned long parent;
19baf839
RO
104};
105
106struct leaf {
91b9a277
OJ
107 t_key key;
108 unsigned long parent;
19baf839 109 struct hlist_head list;
2373ce1c 110 struct rcu_head rcu;
19baf839
RO
111};
112
113struct leaf_info {
114 struct hlist_node hlist;
2373ce1c 115 struct rcu_head rcu;
19baf839
RO
116 int plen;
117 struct list_head falh;
118};
119
120struct tnode {
91b9a277
OJ
121 t_key key;
122 unsigned long parent;
123 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
124 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
125 unsigned short full_children; /* KEYLENGTH bits needed */
126 unsigned short empty_children; /* KEYLENGTH bits needed */
2373ce1c 127 struct rcu_head rcu;
91b9a277 128 struct node *child[0];
19baf839
RO
129};
130
131#ifdef CONFIG_IP_FIB_TRIE_STATS
132struct trie_use_stats {
133 unsigned int gets;
134 unsigned int backtrack;
135 unsigned int semantic_match_passed;
136 unsigned int semantic_match_miss;
137 unsigned int null_node_hit;
2f36895a 138 unsigned int resize_node_skipped;
19baf839
RO
139};
140#endif
141
142struct trie_stat {
143 unsigned int totdepth;
144 unsigned int maxdepth;
145 unsigned int tnodes;
146 unsigned int leaves;
147 unsigned int nullpointers;
06ef921d 148 unsigned int nodesizes[MAX_STAT_DEPTH];
c877efb2 149};
19baf839
RO
150
151struct trie {
91b9a277 152 struct node *trie;
19baf839
RO
153#ifdef CONFIG_IP_FIB_TRIE_STATS
154 struct trie_use_stats stats;
155#endif
91b9a277 156 int size;
19baf839
RO
157 unsigned int revision;
158};
159
19baf839
RO
160static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
161static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
19baf839 162static struct node *resize(struct trie *t, struct tnode *tn);
2f80b3c8
RO
163static struct tnode *inflate(struct trie *t, struct tnode *tn);
164static struct tnode *halve(struct trie *t, struct tnode *tn);
19baf839 165static void tnode_free(struct tnode *tn);
19baf839 166
e18b890b 167static struct kmem_cache *fn_alias_kmem __read_mostly;
19baf839
RO
168static struct trie *trie_local = NULL, *trie_main = NULL;
169
06801916
SH
170static inline struct tnode *node_parent(struct node *node)
171{
172 struct tnode *ret;
173
174 ret = (struct tnode *)(node->parent & ~NODE_TYPE_MASK);
175 return rcu_dereference(ret);
176}
177
178static inline void node_set_parent(struct node *node, struct tnode *ptr)
179{
180 rcu_assign_pointer(node->parent,
181 (unsigned long)ptr | NODE_TYPE(node));
182}
2373ce1c
RO
183
184/* rcu_read_lock needs to be hold by caller from readside */
185
c877efb2 186static inline struct node *tnode_get_child(struct tnode *tn, int i)
19baf839 187{
91b9a277 188 BUG_ON(i >= 1 << tn->bits);
19baf839 189
2373ce1c 190 return rcu_dereference(tn->child[i]);
19baf839
RO
191}
192
bb435b8d 193static inline int tnode_child_length(const struct tnode *tn)
19baf839 194{
91b9a277 195 return 1 << tn->bits;
19baf839
RO
196}
197
19baf839
RO
198static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
199{
91b9a277 200 if (offset < KEYLENGTH)
19baf839 201 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
91b9a277 202 else
19baf839
RO
203 return 0;
204}
205
206static inline int tkey_equals(t_key a, t_key b)
207{
c877efb2 208 return a == b;
19baf839
RO
209}
210
211static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
212{
c877efb2
SH
213 if (bits == 0 || offset >= KEYLENGTH)
214 return 1;
91b9a277
OJ
215 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
216 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
c877efb2 217}
19baf839
RO
218
219static inline int tkey_mismatch(t_key a, int offset, t_key b)
220{
221 t_key diff = a ^ b;
222 int i = offset;
223
c877efb2
SH
224 if (!diff)
225 return 0;
226 while ((diff << i) >> (KEYLENGTH-1) == 0)
19baf839
RO
227 i++;
228 return i;
229}
230
19baf839 231/*
e905a9ed
YH
232 To understand this stuff, an understanding of keys and all their bits is
233 necessary. Every node in the trie has a key associated with it, but not
19baf839
RO
234 all of the bits in that key are significant.
235
236 Consider a node 'n' and its parent 'tp'.
237
e905a9ed
YH
238 If n is a leaf, every bit in its key is significant. Its presence is
239 necessitated by path compression, since during a tree traversal (when
240 searching for a leaf - unless we are doing an insertion) we will completely
241 ignore all skipped bits we encounter. Thus we need to verify, at the end of
242 a potentially successful search, that we have indeed been walking the
19baf839
RO
243 correct key path.
244
e905a9ed
YH
245 Note that we can never "miss" the correct key in the tree if present by
246 following the wrong path. Path compression ensures that segments of the key
247 that are the same for all keys with a given prefix are skipped, but the
248 skipped part *is* identical for each node in the subtrie below the skipped
249 bit! trie_insert() in this implementation takes care of that - note the
19baf839
RO
250 call to tkey_sub_equals() in trie_insert().
251
e905a9ed 252 if n is an internal node - a 'tnode' here, the various parts of its key
19baf839
RO
253 have many different meanings.
254
e905a9ed 255 Example:
19baf839
RO
256 _________________________________________________________________
257 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
258 -----------------------------------------------------------------
e905a9ed 259 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
19baf839
RO
260
261 _________________________________________________________________
262 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
263 -----------------------------------------------------------------
264 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
265
266 tp->pos = 7
267 tp->bits = 3
268 n->pos = 15
91b9a277 269 n->bits = 4
19baf839 270
e905a9ed
YH
271 First, let's just ignore the bits that come before the parent tp, that is
272 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
19baf839
RO
273 not use them for anything.
274
275 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
e905a9ed 276 index into the parent's child array. That is, they will be used to find
19baf839
RO
277 'n' among tp's children.
278
279 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
280 for the node n.
281
e905a9ed 282 All the bits we have seen so far are significant to the node n. The rest
19baf839
RO
283 of the bits are really not needed or indeed known in n->key.
284
e905a9ed 285 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
19baf839 286 n's child array, and will of course be different for each child.
e905a9ed 287
c877efb2 288
19baf839
RO
289 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
290 at this point.
291
292*/
293
0c7770c7 294static inline void check_tnode(const struct tnode *tn)
19baf839 295{
0c7770c7 296 WARN_ON(tn && tn->pos+tn->bits > 32);
19baf839
RO
297}
298
299static int halve_threshold = 25;
300static int inflate_threshold = 50;
965ffea4
RO
301static int halve_threshold_root = 8;
302static int inflate_threshold_root = 15;
19baf839 303
2373ce1c
RO
304
305static void __alias_free_mem(struct rcu_head *head)
19baf839 306{
2373ce1c
RO
307 struct fib_alias *fa = container_of(head, struct fib_alias, rcu);
308 kmem_cache_free(fn_alias_kmem, fa);
19baf839
RO
309}
310
2373ce1c 311static inline void alias_free_mem_rcu(struct fib_alias *fa)
19baf839 312{
2373ce1c
RO
313 call_rcu(&fa->rcu, __alias_free_mem);
314}
91b9a277 315
2373ce1c
RO
316static void __leaf_free_rcu(struct rcu_head *head)
317{
318 kfree(container_of(head, struct leaf, rcu));
319}
91b9a277 320
2373ce1c 321static void __leaf_info_free_rcu(struct rcu_head *head)
19baf839 322{
2373ce1c 323 kfree(container_of(head, struct leaf_info, rcu));
19baf839
RO
324}
325
2373ce1c 326static inline void free_leaf_info(struct leaf_info *leaf)
19baf839 327{
2373ce1c 328 call_rcu(&leaf->rcu, __leaf_info_free_rcu);
19baf839
RO
329}
330
f0e36f8c
PM
331static struct tnode *tnode_alloc(unsigned int size)
332{
2373ce1c
RO
333 struct page *pages;
334
335 if (size <= PAGE_SIZE)
336 return kcalloc(size, 1, GFP_KERNEL);
337
338 pages = alloc_pages(GFP_KERNEL|__GFP_ZERO, get_order(size));
339 if (!pages)
340 return NULL;
341
342 return page_address(pages);
f0e36f8c
PM
343}
344
2373ce1c 345static void __tnode_free_rcu(struct rcu_head *head)
f0e36f8c 346{
2373ce1c 347 struct tnode *tn = container_of(head, struct tnode, rcu);
f0e36f8c 348 unsigned int size = sizeof(struct tnode) +
2373ce1c 349 (1 << tn->bits) * sizeof(struct node *);
f0e36f8c
PM
350
351 if (size <= PAGE_SIZE)
352 kfree(tn);
353 else
354 free_pages((unsigned long)tn, get_order(size));
355}
356
2373ce1c
RO
357static inline void tnode_free(struct tnode *tn)
358{
132adf54 359 if (IS_LEAF(tn)) {
550e29bc
RO
360 struct leaf *l = (struct leaf *) tn;
361 call_rcu_bh(&l->rcu, __leaf_free_rcu);
132adf54 362 } else
550e29bc 363 call_rcu(&tn->rcu, __tnode_free_rcu);
2373ce1c
RO
364}
365
366static struct leaf *leaf_new(void)
367{
368 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
369 if (l) {
370 l->parent = T_LEAF;
371 INIT_HLIST_HEAD(&l->list);
372 }
373 return l;
374}
375
376static struct leaf_info *leaf_info_new(int plen)
377{
378 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
379 if (li) {
380 li->plen = plen;
381 INIT_LIST_HEAD(&li->falh);
382 }
383 return li;
384}
385
19baf839
RO
386static struct tnode* tnode_new(t_key key, int pos, int bits)
387{
388 int nchildren = 1<<bits;
389 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
f0e36f8c 390 struct tnode *tn = tnode_alloc(sz);
19baf839 391
91b9a277 392 if (tn) {
19baf839 393 memset(tn, 0, sz);
2373ce1c 394 tn->parent = T_TNODE;
19baf839
RO
395 tn->pos = pos;
396 tn->bits = bits;
397 tn->key = key;
398 tn->full_children = 0;
399 tn->empty_children = 1<<bits;
400 }
c877efb2 401
0c7770c7
SH
402 pr_debug("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
403 (unsigned int) (sizeof(struct node) * 1<<bits));
19baf839
RO
404 return tn;
405}
406
19baf839
RO
407/*
408 * Check whether a tnode 'n' is "full", i.e. it is an internal node
409 * and no bits are skipped. See discussion in dyntree paper p. 6
410 */
411
bb435b8d 412static inline int tnode_full(const struct tnode *tn, const struct node *n)
19baf839 413{
c877efb2 414 if (n == NULL || IS_LEAF(n))
19baf839
RO
415 return 0;
416
417 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
418}
419
c877efb2 420static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
19baf839
RO
421{
422 tnode_put_child_reorg(tn, i, n, -1);
423}
424
c877efb2 425 /*
19baf839
RO
426 * Add a child at position i overwriting the old value.
427 * Update the value of full_children and empty_children.
428 */
429
c877efb2 430static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
19baf839 431{
2373ce1c 432 struct node *chi = tn->child[i];
19baf839
RO
433 int isfull;
434
0c7770c7
SH
435 BUG_ON(i >= 1<<tn->bits);
436
19baf839
RO
437
438 /* update emptyChildren */
439 if (n == NULL && chi != NULL)
440 tn->empty_children++;
441 else if (n != NULL && chi == NULL)
442 tn->empty_children--;
c877efb2 443
19baf839 444 /* update fullChildren */
91b9a277 445 if (wasfull == -1)
19baf839
RO
446 wasfull = tnode_full(tn, chi);
447
448 isfull = tnode_full(tn, n);
c877efb2 449 if (wasfull && !isfull)
19baf839 450 tn->full_children--;
c877efb2 451 else if (!wasfull && isfull)
19baf839 452 tn->full_children++;
91b9a277 453
c877efb2 454 if (n)
06801916 455 node_set_parent(n, tn);
19baf839 456
2373ce1c 457 rcu_assign_pointer(tn->child[i], n);
19baf839
RO
458}
459
c877efb2 460static struct node *resize(struct trie *t, struct tnode *tn)
19baf839
RO
461{
462 int i;
2f36895a 463 int err = 0;
2f80b3c8 464 struct tnode *old_tn;
e6308be8
RO
465 int inflate_threshold_use;
466 int halve_threshold_use;
05eee48c 467 int max_resize;
19baf839 468
e905a9ed 469 if (!tn)
19baf839
RO
470 return NULL;
471
0c7770c7
SH
472 pr_debug("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
473 tn, inflate_threshold, halve_threshold);
19baf839
RO
474
475 /* No children */
476 if (tn->empty_children == tnode_child_length(tn)) {
477 tnode_free(tn);
478 return NULL;
479 }
480 /* One child */
481 if (tn->empty_children == tnode_child_length(tn) - 1)
482 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 483 struct node *n;
19baf839 484
91b9a277 485 n = tn->child[i];
2373ce1c 486 if (!n)
91b9a277 487 continue;
91b9a277
OJ
488
489 /* compress one level */
06801916 490 node_set_parent(n, NULL);
91b9a277
OJ
491 tnode_free(tn);
492 return n;
19baf839 493 }
c877efb2 494 /*
19baf839
RO
495 * Double as long as the resulting node has a number of
496 * nonempty nodes that are above the threshold.
497 */
498
499 /*
c877efb2
SH
500 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
501 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 502 * Telecommunications, page 6:
c877efb2 503 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
504 * children in the *doubled* node is at least 'high'."
505 *
c877efb2
SH
506 * 'high' in this instance is the variable 'inflate_threshold'. It
507 * is expressed as a percentage, so we multiply it with
508 * tnode_child_length() and instead of multiplying by 2 (since the
509 * child array will be doubled by inflate()) and multiplying
510 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 511 * multiply the left-hand side by 50.
c877efb2
SH
512 *
513 * The left-hand side may look a bit weird: tnode_child_length(tn)
514 * - tn->empty_children is of course the number of non-null children
515 * in the current node. tn->full_children is the number of "full"
19baf839 516 * children, that is non-null tnodes with a skip value of 0.
c877efb2 517 * All of those will be doubled in the resulting inflated tnode, so
19baf839 518 * we just count them one extra time here.
c877efb2 519 *
19baf839 520 * A clearer way to write this would be:
c877efb2 521 *
19baf839 522 * to_be_doubled = tn->full_children;
c877efb2 523 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
524 * tn->full_children;
525 *
526 * new_child_length = tnode_child_length(tn) * 2;
527 *
c877efb2 528 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
529 * new_child_length;
530 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
531 *
532 * ...and so on, tho it would mess up the while () loop.
533 *
19baf839
RO
534 * anyway,
535 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
536 * inflate_threshold
c877efb2 537 *
19baf839
RO
538 * avoid a division:
539 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
540 * inflate_threshold * new_child_length
c877efb2 541 *
19baf839 542 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 543 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 544 * tn->full_children) >= inflate_threshold * new_child_length
c877efb2 545 *
19baf839 546 * expand new_child_length:
c877efb2 547 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 548 * tn->full_children) >=
19baf839 549 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 550 *
19baf839 551 * shorten again:
c877efb2 552 * 50 * (tn->full_children + tnode_child_length(tn) -
91b9a277 553 * tn->empty_children) >= inflate_threshold *
19baf839 554 * tnode_child_length(tn)
c877efb2 555 *
19baf839
RO
556 */
557
558 check_tnode(tn);
c877efb2 559
e6308be8
RO
560 /* Keep root node larger */
561
132adf54 562 if (!tn->parent)
e6308be8 563 inflate_threshold_use = inflate_threshold_root;
e905a9ed 564 else
e6308be8
RO
565 inflate_threshold_use = inflate_threshold;
566
2f36895a 567 err = 0;
05eee48c
RO
568 max_resize = 10;
569 while ((tn->full_children > 0 && max_resize-- &&
19baf839 570 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
e6308be8 571 inflate_threshold_use * tnode_child_length(tn))) {
19baf839 572
2f80b3c8
RO
573 old_tn = tn;
574 tn = inflate(t, tn);
575 if (IS_ERR(tn)) {
576 tn = old_tn;
2f36895a
RO
577#ifdef CONFIG_IP_FIB_TRIE_STATS
578 t->stats.resize_node_skipped++;
579#endif
580 break;
581 }
19baf839
RO
582 }
583
05eee48c
RO
584 if (max_resize < 0) {
585 if (!tn->parent)
586 printk(KERN_WARNING "Fix inflate_threshold_root. Now=%d size=%d bits\n",
587 inflate_threshold_root, tn->bits);
588 else
589 printk(KERN_WARNING "Fix inflate_threshold. Now=%d size=%d bits\n",
590 inflate_threshold, tn->bits);
591 }
592
19baf839
RO
593 check_tnode(tn);
594
595 /*
596 * Halve as long as the number of empty children in this
597 * node is above threshold.
598 */
2f36895a 599
e6308be8
RO
600
601 /* Keep root node larger */
602
132adf54 603 if (!tn->parent)
e6308be8 604 halve_threshold_use = halve_threshold_root;
e905a9ed 605 else
e6308be8
RO
606 halve_threshold_use = halve_threshold;
607
2f36895a 608 err = 0;
05eee48c
RO
609 max_resize = 10;
610 while (tn->bits > 1 && max_resize-- &&
19baf839 611 100 * (tnode_child_length(tn) - tn->empty_children) <
e6308be8 612 halve_threshold_use * tnode_child_length(tn)) {
2f36895a 613
2f80b3c8
RO
614 old_tn = tn;
615 tn = halve(t, tn);
616 if (IS_ERR(tn)) {
617 tn = old_tn;
2f36895a
RO
618#ifdef CONFIG_IP_FIB_TRIE_STATS
619 t->stats.resize_node_skipped++;
620#endif
621 break;
622 }
623 }
19baf839 624
05eee48c
RO
625 if (max_resize < 0) {
626 if (!tn->parent)
627 printk(KERN_WARNING "Fix halve_threshold_root. Now=%d size=%d bits\n",
628 halve_threshold_root, tn->bits);
629 else
630 printk(KERN_WARNING "Fix halve_threshold. Now=%d size=%d bits\n",
631 halve_threshold, tn->bits);
632 }
c877efb2 633
19baf839 634 /* Only one child remains */
19baf839
RO
635 if (tn->empty_children == tnode_child_length(tn) - 1)
636 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 637 struct node *n;
19baf839 638
91b9a277 639 n = tn->child[i];
2373ce1c 640 if (!n)
91b9a277 641 continue;
91b9a277
OJ
642
643 /* compress one level */
644
06801916 645 node_set_parent(n, NULL);
91b9a277
OJ
646 tnode_free(tn);
647 return n;
19baf839
RO
648 }
649
650 return (struct node *) tn;
651}
652
2f80b3c8 653static struct tnode *inflate(struct trie *t, struct tnode *tn)
19baf839
RO
654{
655 struct tnode *inode;
656 struct tnode *oldtnode = tn;
657 int olen = tnode_child_length(tn);
658 int i;
659
0c7770c7 660 pr_debug("In inflate\n");
19baf839
RO
661
662 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
663
0c7770c7 664 if (!tn)
2f80b3c8 665 return ERR_PTR(-ENOMEM);
2f36895a
RO
666
667 /*
c877efb2
SH
668 * Preallocate and store tnodes before the actual work so we
669 * don't get into an inconsistent state if memory allocation
670 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
671 * of tnode is ignored.
672 */
91b9a277
OJ
673
674 for (i = 0; i < olen; i++) {
2f36895a
RO
675 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
676
677 if (inode &&
678 IS_TNODE(inode) &&
679 inode->pos == oldtnode->pos + oldtnode->bits &&
680 inode->bits > 1) {
681 struct tnode *left, *right;
2f36895a 682 t_key m = TKEY_GET_MASK(inode->pos, 1);
c877efb2 683
2f36895a
RO
684 left = tnode_new(inode->key&(~m), inode->pos + 1,
685 inode->bits - 1);
2f80b3c8
RO
686 if (!left)
687 goto nomem;
91b9a277 688
2f36895a
RO
689 right = tnode_new(inode->key|m, inode->pos + 1,
690 inode->bits - 1);
691
e905a9ed 692 if (!right) {
2f80b3c8
RO
693 tnode_free(left);
694 goto nomem;
e905a9ed 695 }
2f36895a
RO
696
697 put_child(t, tn, 2*i, (struct node *) left);
698 put_child(t, tn, 2*i+1, (struct node *) right);
699 }
700 }
701
91b9a277 702 for (i = 0; i < olen; i++) {
19baf839 703 struct node *node = tnode_get_child(oldtnode, i);
91b9a277
OJ
704 struct tnode *left, *right;
705 int size, j;
c877efb2 706
19baf839
RO
707 /* An empty child */
708 if (node == NULL)
709 continue;
710
711 /* A leaf or an internal node with skipped bits */
712
c877efb2 713 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
19baf839 714 tn->pos + tn->bits - 1) {
c877efb2 715 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
19baf839
RO
716 1) == 0)
717 put_child(t, tn, 2*i, node);
718 else
719 put_child(t, tn, 2*i+1, node);
720 continue;
721 }
722
723 /* An internal node with two children */
724 inode = (struct tnode *) node;
725
726 if (inode->bits == 1) {
727 put_child(t, tn, 2*i, inode->child[0]);
728 put_child(t, tn, 2*i+1, inode->child[1]);
729
730 tnode_free(inode);
91b9a277 731 continue;
19baf839
RO
732 }
733
91b9a277
OJ
734 /* An internal node with more than two children */
735
736 /* We will replace this node 'inode' with two new
737 * ones, 'left' and 'right', each with half of the
738 * original children. The two new nodes will have
739 * a position one bit further down the key and this
740 * means that the "significant" part of their keys
741 * (see the discussion near the top of this file)
742 * will differ by one bit, which will be "0" in
743 * left's key and "1" in right's key. Since we are
744 * moving the key position by one step, the bit that
745 * we are moving away from - the bit at position
746 * (inode->pos) - is the one that will differ between
747 * left and right. So... we synthesize that bit in the
748 * two new keys.
749 * The mask 'm' below will be a single "one" bit at
750 * the position (inode->pos)
751 */
19baf839 752
91b9a277
OJ
753 /* Use the old key, but set the new significant
754 * bit to zero.
755 */
2f36895a 756
91b9a277
OJ
757 left = (struct tnode *) tnode_get_child(tn, 2*i);
758 put_child(t, tn, 2*i, NULL);
2f36895a 759
91b9a277 760 BUG_ON(!left);
2f36895a 761
91b9a277
OJ
762 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
763 put_child(t, tn, 2*i+1, NULL);
19baf839 764
91b9a277 765 BUG_ON(!right);
19baf839 766
91b9a277
OJ
767 size = tnode_child_length(left);
768 for (j = 0; j < size; j++) {
769 put_child(t, left, j, inode->child[j]);
770 put_child(t, right, j, inode->child[j + size]);
19baf839 771 }
91b9a277
OJ
772 put_child(t, tn, 2*i, resize(t, left));
773 put_child(t, tn, 2*i+1, resize(t, right));
774
775 tnode_free(inode);
19baf839
RO
776 }
777 tnode_free(oldtnode);
778 return tn;
2f80b3c8
RO
779nomem:
780 {
781 int size = tnode_child_length(tn);
782 int j;
783
0c7770c7 784 for (j = 0; j < size; j++)
2f80b3c8
RO
785 if (tn->child[j])
786 tnode_free((struct tnode *)tn->child[j]);
787
788 tnode_free(tn);
0c7770c7 789
2f80b3c8
RO
790 return ERR_PTR(-ENOMEM);
791 }
19baf839
RO
792}
793
2f80b3c8 794static struct tnode *halve(struct trie *t, struct tnode *tn)
19baf839
RO
795{
796 struct tnode *oldtnode = tn;
797 struct node *left, *right;
798 int i;
799 int olen = tnode_child_length(tn);
800
0c7770c7 801 pr_debug("In halve\n");
c877efb2
SH
802
803 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 804
2f80b3c8
RO
805 if (!tn)
806 return ERR_PTR(-ENOMEM);
2f36895a
RO
807
808 /*
c877efb2
SH
809 * Preallocate and store tnodes before the actual work so we
810 * don't get into an inconsistent state if memory allocation
811 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
812 * of tnode is ignored.
813 */
814
91b9a277 815 for (i = 0; i < olen; i += 2) {
2f36895a
RO
816 left = tnode_get_child(oldtnode, i);
817 right = tnode_get_child(oldtnode, i+1);
c877efb2 818
2f36895a 819 /* Two nonempty children */
0c7770c7 820 if (left && right) {
2f80b3c8 821 struct tnode *newn;
0c7770c7 822
2f80b3c8 823 newn = tnode_new(left->key, tn->pos + tn->bits, 1);
0c7770c7
SH
824
825 if (!newn)
2f80b3c8 826 goto nomem;
0c7770c7 827
2f80b3c8 828 put_child(t, tn, i/2, (struct node *)newn);
2f36895a 829 }
2f36895a 830
2f36895a 831 }
19baf839 832
91b9a277
OJ
833 for (i = 0; i < olen; i += 2) {
834 struct tnode *newBinNode;
835
19baf839
RO
836 left = tnode_get_child(oldtnode, i);
837 right = tnode_get_child(oldtnode, i+1);
c877efb2 838
19baf839
RO
839 /* At least one of the children is empty */
840 if (left == NULL) {
841 if (right == NULL) /* Both are empty */
842 continue;
843 put_child(t, tn, i/2, right);
91b9a277 844 continue;
0c7770c7 845 }
91b9a277
OJ
846
847 if (right == NULL) {
19baf839 848 put_child(t, tn, i/2, left);
91b9a277
OJ
849 continue;
850 }
c877efb2 851
19baf839 852 /* Two nonempty children */
91b9a277
OJ
853 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
854 put_child(t, tn, i/2, NULL);
91b9a277
OJ
855 put_child(t, newBinNode, 0, left);
856 put_child(t, newBinNode, 1, right);
857 put_child(t, tn, i/2, resize(t, newBinNode));
19baf839
RO
858 }
859 tnode_free(oldtnode);
860 return tn;
2f80b3c8
RO
861nomem:
862 {
863 int size = tnode_child_length(tn);
864 int j;
865
0c7770c7 866 for (j = 0; j < size; j++)
2f80b3c8
RO
867 if (tn->child[j])
868 tnode_free((struct tnode *)tn->child[j]);
869
870 tnode_free(tn);
0c7770c7 871
2f80b3c8
RO
872 return ERR_PTR(-ENOMEM);
873 }
19baf839
RO
874}
875
91b9a277 876static void trie_init(struct trie *t)
19baf839 877{
91b9a277
OJ
878 if (!t)
879 return;
880
881 t->size = 0;
2373ce1c 882 rcu_assign_pointer(t->trie, NULL);
91b9a277 883 t->revision = 0;
19baf839 884#ifdef CONFIG_IP_FIB_TRIE_STATS
91b9a277 885 memset(&t->stats, 0, sizeof(struct trie_use_stats));
19baf839 886#endif
19baf839
RO
887}
888
772cb712 889/* readside must use rcu_read_lock currently dump routines
2373ce1c
RO
890 via get_fa_head and dump */
891
772cb712 892static struct leaf_info *find_leaf_info(struct leaf *l, int plen)
19baf839 893{
772cb712 894 struct hlist_head *head = &l->list;
19baf839
RO
895 struct hlist_node *node;
896 struct leaf_info *li;
897
2373ce1c 898 hlist_for_each_entry_rcu(li, node, head, hlist)
c877efb2 899 if (li->plen == plen)
19baf839 900 return li;
91b9a277 901
19baf839
RO
902 return NULL;
903}
904
905static inline struct list_head * get_fa_head(struct leaf *l, int plen)
906{
772cb712 907 struct leaf_info *li = find_leaf_info(l, plen);
c877efb2 908
91b9a277
OJ
909 if (!li)
910 return NULL;
c877efb2 911
91b9a277 912 return &li->falh;
19baf839
RO
913}
914
915static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
916{
e905a9ed
YH
917 struct leaf_info *li = NULL, *last = NULL;
918 struct hlist_node *node;
919
920 if (hlist_empty(head)) {
921 hlist_add_head_rcu(&new->hlist, head);
922 } else {
923 hlist_for_each_entry(li, node, head, hlist) {
924 if (new->plen > li->plen)
925 break;
926
927 last = li;
928 }
929 if (last)
930 hlist_add_after_rcu(&last->hlist, &new->hlist);
931 else
932 hlist_add_before_rcu(&new->hlist, &li->hlist);
933 }
19baf839
RO
934}
935
2373ce1c
RO
936/* rcu_read_lock needs to be hold by caller from readside */
937
19baf839
RO
938static struct leaf *
939fib_find_node(struct trie *t, u32 key)
940{
941 int pos;
942 struct tnode *tn;
943 struct node *n;
944
945 pos = 0;
2373ce1c 946 n = rcu_dereference(t->trie);
19baf839
RO
947
948 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
949 tn = (struct tnode *) n;
91b9a277 950
19baf839 951 check_tnode(tn);
91b9a277 952
c877efb2 953 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
91b9a277 954 pos = tn->pos + tn->bits;
19baf839 955 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
91b9a277 956 } else
19baf839
RO
957 break;
958 }
959 /* Case we have found a leaf. Compare prefixes */
960
91b9a277
OJ
961 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
962 return (struct leaf *)n;
963
19baf839
RO
964 return NULL;
965}
966
967static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
968{
19baf839 969 int wasfull;
06801916
SH
970 t_key cindex, key = tn->key;
971 struct tnode *tp;
19baf839 972
06801916 973 while (tn != NULL && (tp = node_parent((struct node *)tn)) != NULL) {
19baf839
RO
974 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
975 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
976 tn = (struct tnode *) resize (t, (struct tnode *)tn);
977 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
91b9a277 978
06801916
SH
979 tp = node_parent((struct node *) tn);
980 if (!tp)
19baf839 981 break;
06801916 982 tn = tp;
19baf839 983 }
06801916 984
19baf839 985 /* Handle last (top) tnode */
c877efb2 986 if (IS_TNODE(tn))
19baf839
RO
987 tn = (struct tnode*) resize(t, (struct tnode *)tn);
988
989 return (struct node*) tn;
990}
991
2373ce1c
RO
992/* only used from updater-side */
993
f835e471
RO
994static struct list_head *
995fib_insert_node(struct trie *t, int *err, u32 key, int plen)
19baf839
RO
996{
997 int pos, newpos;
998 struct tnode *tp = NULL, *tn = NULL;
999 struct node *n;
1000 struct leaf *l;
1001 int missbit;
c877efb2 1002 struct list_head *fa_head = NULL;
19baf839
RO
1003 struct leaf_info *li;
1004 t_key cindex;
1005
1006 pos = 0;
c877efb2 1007 n = t->trie;
19baf839 1008
c877efb2
SH
1009 /* If we point to NULL, stop. Either the tree is empty and we should
1010 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 1011 * and we should just put our new leaf in that.
c877efb2
SH
1012 * If we point to a T_TNODE, check if it matches our key. Note that
1013 * a T_TNODE might be skipping any number of bits - its 'pos' need
19baf839
RO
1014 * not be the parent's 'pos'+'bits'!
1015 *
c877efb2 1016 * If it does match the current key, get pos/bits from it, extract
19baf839
RO
1017 * the index from our key, push the T_TNODE and walk the tree.
1018 *
1019 * If it doesn't, we have to replace it with a new T_TNODE.
1020 *
c877efb2
SH
1021 * If we point to a T_LEAF, it might or might not have the same key
1022 * as we do. If it does, just change the value, update the T_LEAF's
1023 * value, and return it.
19baf839
RO
1024 * If it doesn't, we need to replace it with a T_TNODE.
1025 */
1026
1027 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1028 tn = (struct tnode *) n;
91b9a277 1029
c877efb2 1030 check_tnode(tn);
91b9a277 1031
c877efb2 1032 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
19baf839 1033 tp = tn;
91b9a277 1034 pos = tn->pos + tn->bits;
19baf839
RO
1035 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1036
06801916 1037 BUG_ON(n && node_parent(n) != tn);
91b9a277 1038 } else
19baf839
RO
1039 break;
1040 }
1041
1042 /*
1043 * n ----> NULL, LEAF or TNODE
1044 *
c877efb2 1045 * tp is n's (parent) ----> NULL or TNODE
19baf839
RO
1046 */
1047
91b9a277 1048 BUG_ON(tp && IS_LEAF(tp));
19baf839
RO
1049
1050 /* Case 1: n is a leaf. Compare prefixes */
1051
c877efb2 1052 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
91b9a277
OJ
1053 struct leaf *l = (struct leaf *) n;
1054
19baf839 1055 li = leaf_info_new(plen);
91b9a277 1056
c877efb2 1057 if (!li) {
f835e471
RO
1058 *err = -ENOMEM;
1059 goto err;
1060 }
19baf839
RO
1061
1062 fa_head = &li->falh;
1063 insert_leaf_info(&l->list, li);
1064 goto done;
1065 }
1066 t->size++;
1067 l = leaf_new();
1068
c877efb2 1069 if (!l) {
f835e471
RO
1070 *err = -ENOMEM;
1071 goto err;
1072 }
19baf839
RO
1073
1074 l->key = key;
1075 li = leaf_info_new(plen);
1076
c877efb2 1077 if (!li) {
f835e471
RO
1078 tnode_free((struct tnode *) l);
1079 *err = -ENOMEM;
1080 goto err;
1081 }
19baf839
RO
1082
1083 fa_head = &li->falh;
1084 insert_leaf_info(&l->list, li);
1085
19baf839 1086 if (t->trie && n == NULL) {
91b9a277 1087 /* Case 2: n is NULL, and will just insert a new leaf */
19baf839 1088
06801916 1089 node_set_parent((struct node *)l, tp);
19baf839 1090
91b9a277
OJ
1091 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1092 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1093 } else {
1094 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
c877efb2
SH
1095 /*
1096 * Add a new tnode here
19baf839
RO
1097 * first tnode need some special handling
1098 */
1099
1100 if (tp)
91b9a277 1101 pos = tp->pos+tp->bits;
19baf839 1102 else
91b9a277
OJ
1103 pos = 0;
1104
c877efb2 1105 if (n) {
19baf839
RO
1106 newpos = tkey_mismatch(key, pos, n->key);
1107 tn = tnode_new(n->key, newpos, 1);
91b9a277 1108 } else {
19baf839 1109 newpos = 0;
c877efb2 1110 tn = tnode_new(key, newpos, 1); /* First tnode */
19baf839 1111 }
19baf839 1112
c877efb2 1113 if (!tn) {
f835e471
RO
1114 free_leaf_info(li);
1115 tnode_free((struct tnode *) l);
1116 *err = -ENOMEM;
1117 goto err;
91b9a277
OJ
1118 }
1119
06801916 1120 node_set_parent((struct node *)tn, tp);
19baf839 1121
91b9a277 1122 missbit = tkey_extract_bits(key, newpos, 1);
19baf839
RO
1123 put_child(t, tn, missbit, (struct node *)l);
1124 put_child(t, tn, 1-missbit, n);
1125
c877efb2 1126 if (tp) {
19baf839
RO
1127 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1128 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
91b9a277 1129 } else {
2373ce1c 1130 rcu_assign_pointer(t->trie, (struct node *)tn); /* First tnode */
19baf839
RO
1131 tp = tn;
1132 }
1133 }
91b9a277
OJ
1134
1135 if (tp && tp->pos + tp->bits > 32)
78c6671a 1136 printk(KERN_WARNING "fib_trie tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
19baf839 1137 tp, tp->pos, tp->bits, key, plen);
91b9a277 1138
19baf839 1139 /* Rebalance the trie */
2373ce1c
RO
1140
1141 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
f835e471
RO
1142done:
1143 t->revision++;
91b9a277 1144err:
19baf839
RO
1145 return fa_head;
1146}
1147
d562f1f8
RO
1148/*
1149 * Caller must hold RTNL.
1150 */
4e902c57 1151static int fn_trie_insert(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1152{
1153 struct trie *t = (struct trie *) tb->tb_data;
1154 struct fib_alias *fa, *new_fa;
c877efb2 1155 struct list_head *fa_head = NULL;
19baf839 1156 struct fib_info *fi;
4e902c57
TG
1157 int plen = cfg->fc_dst_len;
1158 u8 tos = cfg->fc_tos;
19baf839
RO
1159 u32 key, mask;
1160 int err;
1161 struct leaf *l;
1162
1163 if (plen > 32)
1164 return -EINVAL;
1165
4e902c57 1166 key = ntohl(cfg->fc_dst);
19baf839 1167
2dfe55b4 1168 pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen);
19baf839 1169
91b9a277 1170 mask = ntohl(inet_make_mask(plen));
19baf839 1171
c877efb2 1172 if (key & ~mask)
19baf839
RO
1173 return -EINVAL;
1174
1175 key = key & mask;
1176
4e902c57
TG
1177 fi = fib_create_info(cfg);
1178 if (IS_ERR(fi)) {
1179 err = PTR_ERR(fi);
19baf839 1180 goto err;
4e902c57 1181 }
19baf839
RO
1182
1183 l = fib_find_node(t, key);
c877efb2 1184 fa = NULL;
19baf839 1185
c877efb2 1186 if (l) {
19baf839
RO
1187 fa_head = get_fa_head(l, plen);
1188 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1189 }
1190
1191 /* Now fa, if non-NULL, points to the first fib alias
1192 * with the same keys [prefix,tos,priority], if such key already
1193 * exists or to the node before which we will insert new one.
1194 *
1195 * If fa is NULL, we will need to allocate a new one and
1196 * insert to the head of f.
1197 *
1198 * If f is NULL, no fib node matched the destination key
1199 * and we need to allocate a new one of those as well.
1200 */
1201
91b9a277 1202 if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
19baf839
RO
1203 struct fib_alias *fa_orig;
1204
1205 err = -EEXIST;
4e902c57 1206 if (cfg->fc_nlflags & NLM_F_EXCL)
19baf839
RO
1207 goto out;
1208
4e902c57 1209 if (cfg->fc_nlflags & NLM_F_REPLACE) {
19baf839
RO
1210 struct fib_info *fi_drop;
1211 u8 state;
1212
2373ce1c 1213 err = -ENOBUFS;
e94b1766 1214 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
2373ce1c
RO
1215 if (new_fa == NULL)
1216 goto out;
19baf839
RO
1217
1218 fi_drop = fa->fa_info;
2373ce1c
RO
1219 new_fa->fa_tos = fa->fa_tos;
1220 new_fa->fa_info = fi;
4e902c57
TG
1221 new_fa->fa_type = cfg->fc_type;
1222 new_fa->fa_scope = cfg->fc_scope;
19baf839 1223 state = fa->fa_state;
2373ce1c 1224 new_fa->fa_state &= ~FA_S_ACCESSED;
19baf839 1225
2373ce1c
RO
1226 list_replace_rcu(&fa->fa_list, &new_fa->fa_list);
1227 alias_free_mem_rcu(fa);
19baf839
RO
1228
1229 fib_release_info(fi_drop);
1230 if (state & FA_S_ACCESSED)
91b9a277 1231 rt_cache_flush(-1);
b8f55831
MK
1232 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen,
1233 tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE);
19baf839 1234
91b9a277 1235 goto succeeded;
19baf839
RO
1236 }
1237 /* Error if we find a perfect match which
1238 * uses the same scope, type, and nexthop
1239 * information.
1240 */
1241 fa_orig = fa;
1242 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1243 if (fa->fa_tos != tos)
1244 break;
1245 if (fa->fa_info->fib_priority != fi->fib_priority)
1246 break;
4e902c57
TG
1247 if (fa->fa_type == cfg->fc_type &&
1248 fa->fa_scope == cfg->fc_scope &&
19baf839
RO
1249 fa->fa_info == fi) {
1250 goto out;
1251 }
1252 }
4e902c57 1253 if (!(cfg->fc_nlflags & NLM_F_APPEND))
19baf839
RO
1254 fa = fa_orig;
1255 }
1256 err = -ENOENT;
4e902c57 1257 if (!(cfg->fc_nlflags & NLM_F_CREATE))
19baf839
RO
1258 goto out;
1259
1260 err = -ENOBUFS;
e94b1766 1261 new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL);
19baf839
RO
1262 if (new_fa == NULL)
1263 goto out;
1264
1265 new_fa->fa_info = fi;
1266 new_fa->fa_tos = tos;
4e902c57
TG
1267 new_fa->fa_type = cfg->fc_type;
1268 new_fa->fa_scope = cfg->fc_scope;
19baf839 1269 new_fa->fa_state = 0;
19baf839
RO
1270 /*
1271 * Insert new entry to the list.
1272 */
1273
c877efb2 1274 if (!fa_head) {
f835e471 1275 err = 0;
b47b2ec1 1276 fa_head = fib_insert_node(t, &err, key, plen);
c877efb2 1277 if (err)
f835e471
RO
1278 goto out_free_new_fa;
1279 }
19baf839 1280
2373ce1c
RO
1281 list_add_tail_rcu(&new_fa->fa_list,
1282 (fa ? &fa->fa_list : fa_head));
19baf839
RO
1283
1284 rt_cache_flush(-1);
4e902c57 1285 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id,
b8f55831 1286 &cfg->fc_nlinfo, 0);
19baf839
RO
1287succeeded:
1288 return 0;
f835e471
RO
1289
1290out_free_new_fa:
1291 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1292out:
1293 fib_release_info(fi);
91b9a277 1294err:
19baf839
RO
1295 return err;
1296}
1297
2373ce1c 1298
772cb712 1299/* should be called with rcu_read_lock */
0c7770c7
SH
1300static inline int check_leaf(struct trie *t, struct leaf *l,
1301 t_key key, int *plen, const struct flowi *flp,
06c74270 1302 struct fib_result *res)
19baf839 1303{
06c74270 1304 int err, i;
888454c5 1305 __be32 mask;
19baf839
RO
1306 struct leaf_info *li;
1307 struct hlist_head *hhead = &l->list;
1308 struct hlist_node *node;
c877efb2 1309
2373ce1c 1310 hlist_for_each_entry_rcu(li, node, hhead, hlist) {
19baf839 1311 i = li->plen;
888454c5
AV
1312 mask = inet_make_mask(i);
1313 if (l->key != (key & ntohl(mask)))
19baf839
RO
1314 continue;
1315
888454c5 1316 if ((err = fib_semantic_match(&li->falh, flp, res, htonl(l->key), mask, i)) <= 0) {
19baf839
RO
1317 *plen = i;
1318#ifdef CONFIG_IP_FIB_TRIE_STATS
1319 t->stats.semantic_match_passed++;
1320#endif
06c74270 1321 return err;
19baf839
RO
1322 }
1323#ifdef CONFIG_IP_FIB_TRIE_STATS
1324 t->stats.semantic_match_miss++;
1325#endif
1326 }
06c74270 1327 return 1;
19baf839
RO
1328}
1329
1330static int
1331fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1332{
1333 struct trie *t = (struct trie *) tb->tb_data;
1334 int plen, ret = 0;
1335 struct node *n;
1336 struct tnode *pn;
1337 int pos, bits;
91b9a277 1338 t_key key = ntohl(flp->fl4_dst);
19baf839
RO
1339 int chopped_off;
1340 t_key cindex = 0;
1341 int current_prefix_length = KEYLENGTH;
91b9a277
OJ
1342 struct tnode *cn;
1343 t_key node_prefix, key_prefix, pref_mismatch;
1344 int mp;
1345
2373ce1c 1346 rcu_read_lock();
91b9a277 1347
2373ce1c 1348 n = rcu_dereference(t->trie);
c877efb2 1349 if (!n)
19baf839
RO
1350 goto failed;
1351
1352#ifdef CONFIG_IP_FIB_TRIE_STATS
1353 t->stats.gets++;
1354#endif
1355
1356 /* Just a leaf? */
1357 if (IS_LEAF(n)) {
06c74270 1358 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
19baf839
RO
1359 goto found;
1360 goto failed;
1361 }
1362 pn = (struct tnode *) n;
1363 chopped_off = 0;
c877efb2 1364
91b9a277 1365 while (pn) {
19baf839
RO
1366 pos = pn->pos;
1367 bits = pn->bits;
1368
c877efb2 1369 if (!chopped_off)
19baf839
RO
1370 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1371
1372 n = tnode_get_child(pn, cindex);
1373
1374 if (n == NULL) {
1375#ifdef CONFIG_IP_FIB_TRIE_STATS
1376 t->stats.null_node_hit++;
1377#endif
1378 goto backtrace;
1379 }
1380
91b9a277
OJ
1381 if (IS_LEAF(n)) {
1382 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1383 goto found;
1384 else
1385 goto backtrace;
1386 }
1387
19baf839
RO
1388#define HL_OPTIMIZE
1389#ifdef HL_OPTIMIZE
91b9a277 1390 cn = (struct tnode *)n;
19baf839 1391
91b9a277
OJ
1392 /*
1393 * It's a tnode, and we can do some extra checks here if we
1394 * like, to avoid descending into a dead-end branch.
1395 * This tnode is in the parent's child array at index
1396 * key[p_pos..p_pos+p_bits] but potentially with some bits
1397 * chopped off, so in reality the index may be just a
1398 * subprefix, padded with zero at the end.
1399 * We can also take a look at any skipped bits in this
1400 * tnode - everything up to p_pos is supposed to be ok,
1401 * and the non-chopped bits of the index (se previous
1402 * paragraph) are also guaranteed ok, but the rest is
1403 * considered unknown.
1404 *
1405 * The skipped bits are key[pos+bits..cn->pos].
1406 */
19baf839 1407
91b9a277
OJ
1408 /* If current_prefix_length < pos+bits, we are already doing
1409 * actual prefix matching, which means everything from
1410 * pos+(bits-chopped_off) onward must be zero along some
1411 * branch of this subtree - otherwise there is *no* valid
1412 * prefix present. Here we can only check the skipped
1413 * bits. Remember, since we have already indexed into the
1414 * parent's child array, we know that the bits we chopped of
1415 * *are* zero.
1416 */
19baf839 1417
91b9a277 1418 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
19baf839 1419
91b9a277
OJ
1420 if (current_prefix_length < pos+bits) {
1421 if (tkey_extract_bits(cn->key, current_prefix_length,
1422 cn->pos - current_prefix_length) != 0 ||
1423 !(cn->child[0]))
1424 goto backtrace;
1425 }
19baf839 1426
91b9a277
OJ
1427 /*
1428 * If chopped_off=0, the index is fully validated and we
1429 * only need to look at the skipped bits for this, the new,
1430 * tnode. What we actually want to do is to find out if
1431 * these skipped bits match our key perfectly, or if we will
1432 * have to count on finding a matching prefix further down,
1433 * because if we do, we would like to have some way of
1434 * verifying the existence of such a prefix at this point.
1435 */
19baf839 1436
91b9a277
OJ
1437 /* The only thing we can do at this point is to verify that
1438 * any such matching prefix can indeed be a prefix to our
1439 * key, and if the bits in the node we are inspecting that
1440 * do not match our key are not ZERO, this cannot be true.
1441 * Thus, find out where there is a mismatch (before cn->pos)
1442 * and verify that all the mismatching bits are zero in the
1443 * new tnode's key.
1444 */
19baf839 1445
91b9a277
OJ
1446 /* Note: We aren't very concerned about the piece of the key
1447 * that precede pn->pos+pn->bits, since these have already been
1448 * checked. The bits after cn->pos aren't checked since these are
1449 * by definition "unknown" at this point. Thus, what we want to
1450 * see is if we are about to enter the "prefix matching" state,
1451 * and in that case verify that the skipped bits that will prevail
1452 * throughout this subtree are zero, as they have to be if we are
1453 * to find a matching prefix.
1454 */
1455
1456 node_prefix = MASK_PFX(cn->key, cn->pos);
1457 key_prefix = MASK_PFX(key, cn->pos);
1458 pref_mismatch = key_prefix^node_prefix;
1459 mp = 0;
1460
1461 /* In short: If skipped bits in this node do not match the search
1462 * key, enter the "prefix matching" state.directly.
1463 */
1464 if (pref_mismatch) {
1465 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1466 mp++;
1467 pref_mismatch = pref_mismatch <<1;
1468 }
1469 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1470
1471 if (key_prefix != 0)
1472 goto backtrace;
1473
1474 if (current_prefix_length >= cn->pos)
1475 current_prefix_length = mp;
c877efb2 1476 }
91b9a277
OJ
1477#endif
1478 pn = (struct tnode *)n; /* Descend */
1479 chopped_off = 0;
1480 continue;
1481
19baf839
RO
1482backtrace:
1483 chopped_off++;
1484
1485 /* As zero don't change the child key (cindex) */
91b9a277 1486 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
19baf839 1487 chopped_off++;
19baf839
RO
1488
1489 /* Decrease current_... with bits chopped off */
1490 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1491 current_prefix_length = pn->pos + pn->bits - chopped_off;
91b9a277 1492
19baf839 1493 /*
c877efb2 1494 * Either we do the actual chop off according or if we have
19baf839
RO
1495 * chopped off all bits in this tnode walk up to our parent.
1496 */
1497
91b9a277 1498 if (chopped_off <= pn->bits) {
19baf839 1499 cindex &= ~(1 << (chopped_off-1));
91b9a277 1500 } else {
06801916
SH
1501 struct tnode *parent = node_parent((struct node *) pn);
1502 if (!parent)
19baf839 1503 goto failed;
91b9a277 1504
19baf839 1505 /* Get Child's index */
06801916
SH
1506 cindex = tkey_extract_bits(pn->key, parent->pos, parent->bits);
1507 pn = parent;
19baf839
RO
1508 chopped_off = 0;
1509
1510#ifdef CONFIG_IP_FIB_TRIE_STATS
1511 t->stats.backtrack++;
1512#endif
1513 goto backtrace;
c877efb2 1514 }
19baf839
RO
1515 }
1516failed:
c877efb2 1517 ret = 1;
19baf839 1518found:
2373ce1c 1519 rcu_read_unlock();
19baf839
RO
1520 return ret;
1521}
1522
2373ce1c 1523/* only called from updater side */
19baf839
RO
1524static int trie_leaf_remove(struct trie *t, t_key key)
1525{
1526 t_key cindex;
1527 struct tnode *tp = NULL;
1528 struct node *n = t->trie;
1529 struct leaf *l;
1530
0c7770c7 1531 pr_debug("entering trie_leaf_remove(%p)\n", n);
19baf839
RO
1532
1533 /* Note that in the case skipped bits, those bits are *not* checked!
c877efb2 1534 * When we finish this, we will have NULL or a T_LEAF, and the
19baf839
RO
1535 * T_LEAF may or may not match our key.
1536 */
1537
91b9a277 1538 while (n != NULL && IS_TNODE(n)) {
19baf839
RO
1539 struct tnode *tn = (struct tnode *) n;
1540 check_tnode(tn);
1541 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1542
06801916 1543 BUG_ON(n && node_parent(n) != tn);
91b9a277 1544 }
19baf839
RO
1545 l = (struct leaf *) n;
1546
c877efb2 1547 if (!n || !tkey_equals(l->key, key))
19baf839 1548 return 0;
c877efb2
SH
1549
1550 /*
1551 * Key found.
1552 * Remove the leaf and rebalance the tree
19baf839
RO
1553 */
1554
1555 t->revision++;
1556 t->size--;
1557
06801916 1558 tp = node_parent(n);
19baf839
RO
1559 tnode_free((struct tnode *) n);
1560
c877efb2 1561 if (tp) {
19baf839
RO
1562 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1563 put_child(t, (struct tnode *)tp, cindex, NULL);
2373ce1c 1564 rcu_assign_pointer(t->trie, trie_rebalance(t, tp));
91b9a277 1565 } else
2373ce1c 1566 rcu_assign_pointer(t->trie, NULL);
19baf839
RO
1567
1568 return 1;
1569}
1570
d562f1f8
RO
1571/*
1572 * Caller must hold RTNL.
1573 */
4e902c57 1574static int fn_trie_delete(struct fib_table *tb, struct fib_config *cfg)
19baf839
RO
1575{
1576 struct trie *t = (struct trie *) tb->tb_data;
1577 u32 key, mask;
4e902c57
TG
1578 int plen = cfg->fc_dst_len;
1579 u8 tos = cfg->fc_tos;
19baf839
RO
1580 struct fib_alias *fa, *fa_to_delete;
1581 struct list_head *fa_head;
1582 struct leaf *l;
91b9a277
OJ
1583 struct leaf_info *li;
1584
c877efb2 1585 if (plen > 32)
19baf839
RO
1586 return -EINVAL;
1587
4e902c57 1588 key = ntohl(cfg->fc_dst);
91b9a277 1589 mask = ntohl(inet_make_mask(plen));
19baf839 1590
c877efb2 1591 if (key & ~mask)
19baf839
RO
1592 return -EINVAL;
1593
1594 key = key & mask;
1595 l = fib_find_node(t, key);
1596
c877efb2 1597 if (!l)
19baf839
RO
1598 return -ESRCH;
1599
1600 fa_head = get_fa_head(l, plen);
1601 fa = fib_find_alias(fa_head, tos, 0);
1602
1603 if (!fa)
1604 return -ESRCH;
1605
0c7770c7 1606 pr_debug("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
19baf839
RO
1607
1608 fa_to_delete = NULL;
1609 fa_head = fa->fa_list.prev;
2373ce1c 1610
19baf839
RO
1611 list_for_each_entry(fa, fa_head, fa_list) {
1612 struct fib_info *fi = fa->fa_info;
1613
1614 if (fa->fa_tos != tos)
1615 break;
1616
4e902c57
TG
1617 if ((!cfg->fc_type || fa->fa_type == cfg->fc_type) &&
1618 (cfg->fc_scope == RT_SCOPE_NOWHERE ||
1619 fa->fa_scope == cfg->fc_scope) &&
1620 (!cfg->fc_protocol ||
1621 fi->fib_protocol == cfg->fc_protocol) &&
1622 fib_nh_match(cfg, fi) == 0) {
19baf839
RO
1623 fa_to_delete = fa;
1624 break;
1625 }
1626 }
1627
91b9a277
OJ
1628 if (!fa_to_delete)
1629 return -ESRCH;
19baf839 1630
91b9a277 1631 fa = fa_to_delete;
4e902c57 1632 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id,
b8f55831 1633 &cfg->fc_nlinfo, 0);
91b9a277
OJ
1634
1635 l = fib_find_node(t, key);
772cb712 1636 li = find_leaf_info(l, plen);
19baf839 1637
2373ce1c 1638 list_del_rcu(&fa->fa_list);
19baf839 1639
91b9a277 1640 if (list_empty(fa_head)) {
2373ce1c 1641 hlist_del_rcu(&li->hlist);
91b9a277 1642 free_leaf_info(li);
2373ce1c 1643 }
19baf839 1644
91b9a277
OJ
1645 if (hlist_empty(&l->list))
1646 trie_leaf_remove(t, key);
19baf839 1647
91b9a277
OJ
1648 if (fa->fa_state & FA_S_ACCESSED)
1649 rt_cache_flush(-1);
19baf839 1650
2373ce1c
RO
1651 fib_release_info(fa->fa_info);
1652 alias_free_mem_rcu(fa);
91b9a277 1653 return 0;
19baf839
RO
1654}
1655
1656static int trie_flush_list(struct trie *t, struct list_head *head)
1657{
1658 struct fib_alias *fa, *fa_node;
1659 int found = 0;
1660
1661 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1662 struct fib_info *fi = fa->fa_info;
19baf839 1663
2373ce1c
RO
1664 if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
1665 list_del_rcu(&fa->fa_list);
1666 fib_release_info(fa->fa_info);
1667 alias_free_mem_rcu(fa);
19baf839
RO
1668 found++;
1669 }
1670 }
1671 return found;
1672}
1673
1674static int trie_flush_leaf(struct trie *t, struct leaf *l)
1675{
1676 int found = 0;
1677 struct hlist_head *lih = &l->list;
1678 struct hlist_node *node, *tmp;
1679 struct leaf_info *li = NULL;
1680
1681 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
19baf839
RO
1682 found += trie_flush_list(t, &li->falh);
1683
1684 if (list_empty(&li->falh)) {
2373ce1c 1685 hlist_del_rcu(&li->hlist);
19baf839
RO
1686 free_leaf_info(li);
1687 }
1688 }
1689 return found;
1690}
1691
2373ce1c
RO
1692/* rcu_read_lock needs to be hold by caller from readside */
1693
19baf839
RO
1694static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1695{
1696 struct node *c = (struct node *) thisleaf;
1697 struct tnode *p;
1698 int idx;
2373ce1c 1699 struct node *trie = rcu_dereference(t->trie);
19baf839 1700
c877efb2 1701 if (c == NULL) {
2373ce1c 1702 if (trie == NULL)
19baf839
RO
1703 return NULL;
1704
2373ce1c
RO
1705 if (IS_LEAF(trie)) /* trie w. just a leaf */
1706 return (struct leaf *) trie;
19baf839 1707
2373ce1c 1708 p = (struct tnode*) trie; /* Start */
91b9a277 1709 } else
06801916 1710 p = node_parent(c);
c877efb2 1711
19baf839
RO
1712 while (p) {
1713 int pos, last;
1714
1715 /* Find the next child of the parent */
c877efb2
SH
1716 if (c)
1717 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1718 else
19baf839
RO
1719 pos = 0;
1720
1721 last = 1 << p->bits;
91b9a277 1722 for (idx = pos; idx < last ; idx++) {
2373ce1c
RO
1723 c = rcu_dereference(p->child[idx]);
1724
1725 if (!c)
91b9a277
OJ
1726 continue;
1727
1728 /* Decend if tnode */
2373ce1c
RO
1729 while (IS_TNODE(c)) {
1730 p = (struct tnode *) c;
e905a9ed 1731 idx = 0;
91b9a277
OJ
1732
1733 /* Rightmost non-NULL branch */
1734 if (p && IS_TNODE(p))
2373ce1c
RO
1735 while (!(c = rcu_dereference(p->child[idx]))
1736 && idx < (1<<p->bits)) idx++;
91b9a277
OJ
1737
1738 /* Done with this tnode? */
2373ce1c 1739 if (idx >= (1 << p->bits) || !c)
91b9a277 1740 goto up;
19baf839 1741 }
2373ce1c 1742 return (struct leaf *) c;
19baf839
RO
1743 }
1744up:
1745 /* No more children go up one step */
91b9a277 1746 c = (struct node *) p;
06801916 1747 p = node_parent(c);
19baf839
RO
1748 }
1749 return NULL; /* Ready. Root of trie */
1750}
1751
d562f1f8
RO
1752/*
1753 * Caller must hold RTNL.
1754 */
19baf839
RO
1755static int fn_trie_flush(struct fib_table *tb)
1756{
1757 struct trie *t = (struct trie *) tb->tb_data;
1758 struct leaf *ll = NULL, *l = NULL;
1759 int found = 0, h;
1760
1761 t->revision++;
1762
91b9a277 1763 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
19baf839
RO
1764 found += trie_flush_leaf(t, l);
1765
1766 if (ll && hlist_empty(&ll->list))
1767 trie_leaf_remove(t, ll->key);
1768 ll = l;
1769 }
1770
1771 if (ll && hlist_empty(&ll->list))
1772 trie_leaf_remove(t, ll->key);
1773
0c7770c7 1774 pr_debug("trie_flush found=%d\n", found);
19baf839
RO
1775 return found;
1776}
1777
91b9a277 1778static int trie_last_dflt = -1;
19baf839
RO
1779
1780static void
1781fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1782{
1783 struct trie *t = (struct trie *) tb->tb_data;
1784 int order, last_idx;
1785 struct fib_info *fi = NULL;
1786 struct fib_info *last_resort;
1787 struct fib_alias *fa = NULL;
1788 struct list_head *fa_head;
1789 struct leaf *l;
1790
1791 last_idx = -1;
1792 last_resort = NULL;
1793 order = -1;
1794
2373ce1c 1795 rcu_read_lock();
c877efb2 1796
19baf839 1797 l = fib_find_node(t, 0);
c877efb2 1798 if (!l)
19baf839
RO
1799 goto out;
1800
1801 fa_head = get_fa_head(l, 0);
c877efb2 1802 if (!fa_head)
19baf839
RO
1803 goto out;
1804
c877efb2 1805 if (list_empty(fa_head))
19baf839
RO
1806 goto out;
1807
2373ce1c 1808 list_for_each_entry_rcu(fa, fa_head, fa_list) {
19baf839 1809 struct fib_info *next_fi = fa->fa_info;
91b9a277 1810
19baf839
RO
1811 if (fa->fa_scope != res->scope ||
1812 fa->fa_type != RTN_UNICAST)
1813 continue;
91b9a277 1814
19baf839
RO
1815 if (next_fi->fib_priority > res->fi->fib_priority)
1816 break;
1817 if (!next_fi->fib_nh[0].nh_gw ||
1818 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1819 continue;
1820 fa->fa_state |= FA_S_ACCESSED;
91b9a277 1821
19baf839
RO
1822 if (fi == NULL) {
1823 if (next_fi != res->fi)
1824 break;
1825 } else if (!fib_detect_death(fi, order, &last_resort,
1826 &last_idx, &trie_last_dflt)) {
1827 if (res->fi)
1828 fib_info_put(res->fi);
1829 res->fi = fi;
1830 atomic_inc(&fi->fib_clntref);
1831 trie_last_dflt = order;
1832 goto out;
1833 }
1834 fi = next_fi;
1835 order++;
1836 }
1837 if (order <= 0 || fi == NULL) {
1838 trie_last_dflt = -1;
1839 goto out;
1840 }
1841
1842 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1843 if (res->fi)
1844 fib_info_put(res->fi);
1845 res->fi = fi;
1846 atomic_inc(&fi->fib_clntref);
1847 trie_last_dflt = order;
1848 goto out;
1849 }
1850 if (last_idx >= 0) {
1851 if (res->fi)
1852 fib_info_put(res->fi);
1853 res->fi = last_resort;
1854 if (last_resort)
1855 atomic_inc(&last_resort->fib_clntref);
1856 }
1857 trie_last_dflt = last_idx;
1858 out:;
2373ce1c 1859 rcu_read_unlock();
19baf839
RO
1860}
1861
c877efb2 1862static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
19baf839
RO
1863 struct sk_buff *skb, struct netlink_callback *cb)
1864{
1865 int i, s_i;
1866 struct fib_alias *fa;
1867
32ab5f80 1868 __be32 xkey = htonl(key);
19baf839 1869
1af5a8c4 1870 s_i = cb->args[4];
19baf839
RO
1871 i = 0;
1872
2373ce1c
RO
1873 /* rcu_read_lock is hold by caller */
1874
1875 list_for_each_entry_rcu(fa, fah, fa_list) {
19baf839
RO
1876 if (i < s_i) {
1877 i++;
1878 continue;
1879 }
78c6671a 1880 BUG_ON(!fa->fa_info);
19baf839
RO
1881
1882 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1883 cb->nlh->nlmsg_seq,
1884 RTM_NEWROUTE,
1885 tb->tb_id,
1886 fa->fa_type,
1887 fa->fa_scope,
be403ea1 1888 xkey,
19baf839
RO
1889 plen,
1890 fa->fa_tos,
90f66914 1891 fa->fa_info, 0) < 0) {
1af5a8c4 1892 cb->args[4] = i;
19baf839 1893 return -1;
91b9a277 1894 }
19baf839
RO
1895 i++;
1896 }
1af5a8c4 1897 cb->args[4] = i;
19baf839
RO
1898 return skb->len;
1899}
1900
c877efb2 1901static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
19baf839
RO
1902 struct netlink_callback *cb)
1903{
1904 int h, s_h;
1905 struct list_head *fa_head;
1906 struct leaf *l = NULL;
19baf839 1907
1af5a8c4 1908 s_h = cb->args[3];
19baf839 1909
91b9a277 1910 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
19baf839
RO
1911 if (h < s_h)
1912 continue;
1913 if (h > s_h)
1af5a8c4
PM
1914 memset(&cb->args[4], 0,
1915 sizeof(cb->args) - 4*sizeof(cb->args[0]));
19baf839
RO
1916
1917 fa_head = get_fa_head(l, plen);
91b9a277 1918
c877efb2 1919 if (!fa_head)
19baf839
RO
1920 continue;
1921
c877efb2 1922 if (list_empty(fa_head))
19baf839
RO
1923 continue;
1924
1925 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
1af5a8c4 1926 cb->args[3] = h;
19baf839
RO
1927 return -1;
1928 }
1929 }
1af5a8c4 1930 cb->args[3] = h;
19baf839
RO
1931 return skb->len;
1932}
1933
1934static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1935{
1936 int m, s_m;
1937 struct trie *t = (struct trie *) tb->tb_data;
1938
1af5a8c4 1939 s_m = cb->args[2];
19baf839 1940
2373ce1c 1941 rcu_read_lock();
91b9a277 1942 for (m = 0; m <= 32; m++) {
19baf839
RO
1943 if (m < s_m)
1944 continue;
1945 if (m > s_m)
1af5a8c4
PM
1946 memset(&cb->args[3], 0,
1947 sizeof(cb->args) - 3*sizeof(cb->args[0]));
19baf839
RO
1948
1949 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1af5a8c4 1950 cb->args[2] = m;
19baf839
RO
1951 goto out;
1952 }
1953 }
2373ce1c 1954 rcu_read_unlock();
1af5a8c4 1955 cb->args[2] = m;
19baf839 1956 return skb->len;
91b9a277 1957out:
2373ce1c 1958 rcu_read_unlock();
19baf839
RO
1959 return -1;
1960}
1961
1962/* Fix more generic FIB names for init later */
1963
1964#ifdef CONFIG_IP_MULTIPLE_TABLES
2dfe55b4 1965struct fib_table * fib_hash_init(u32 id)
19baf839 1966#else
2dfe55b4 1967struct fib_table * __init fib_hash_init(u32 id)
19baf839
RO
1968#endif
1969{
1970 struct fib_table *tb;
1971 struct trie *t;
1972
1973 if (fn_alias_kmem == NULL)
1974 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1975 sizeof(struct fib_alias),
1976 0, SLAB_HWCACHE_ALIGN,
20c2df83 1977 NULL);
19baf839
RO
1978
1979 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1980 GFP_KERNEL);
1981 if (tb == NULL)
1982 return NULL;
1983
1984 tb->tb_id = id;
1985 tb->tb_lookup = fn_trie_lookup;
1986 tb->tb_insert = fn_trie_insert;
1987 tb->tb_delete = fn_trie_delete;
1988 tb->tb_flush = fn_trie_flush;
1989 tb->tb_select_default = fn_trie_select_default;
1990 tb->tb_dump = fn_trie_dump;
1991 memset(tb->tb_data, 0, sizeof(struct trie));
1992
1993 t = (struct trie *) tb->tb_data;
1994
1995 trie_init(t);
1996
c877efb2 1997 if (id == RT_TABLE_LOCAL)
91b9a277 1998 trie_local = t;
c877efb2 1999 else if (id == RT_TABLE_MAIN)
91b9a277 2000 trie_main = t;
19baf839
RO
2001
2002 if (id == RT_TABLE_LOCAL)
78c6671a 2003 printk(KERN_INFO "IPv4 FIB: Using LC-trie version %s\n", VERSION);
19baf839
RO
2004
2005 return tb;
2006}
2007
cb7b593c
SH
2008#ifdef CONFIG_PROC_FS
2009/* Depth first Trie walk iterator */
2010struct fib_trie_iter {
2011 struct tnode *tnode;
2012 struct trie *trie;
2013 unsigned index;
2014 unsigned depth;
2015};
19baf839 2016
cb7b593c 2017static struct node *fib_trie_get_next(struct fib_trie_iter *iter)
19baf839 2018{
cb7b593c
SH
2019 struct tnode *tn = iter->tnode;
2020 unsigned cindex = iter->index;
2021 struct tnode *p;
19baf839 2022
6640e697
EB
2023 /* A single entry routing table */
2024 if (!tn)
2025 return NULL;
2026
cb7b593c
SH
2027 pr_debug("get_next iter={node=%p index=%d depth=%d}\n",
2028 iter->tnode, iter->index, iter->depth);
2029rescan:
2030 while (cindex < (1<<tn->bits)) {
2031 struct node *n = tnode_get_child(tn, cindex);
19baf839 2032
cb7b593c
SH
2033 if (n) {
2034 if (IS_LEAF(n)) {
2035 iter->tnode = tn;
2036 iter->index = cindex + 1;
2037 } else {
2038 /* push down one level */
2039 iter->tnode = (struct tnode *) n;
2040 iter->index = 0;
2041 ++iter->depth;
2042 }
2043 return n;
2044 }
19baf839 2045
cb7b593c
SH
2046 ++cindex;
2047 }
91b9a277 2048
cb7b593c 2049 /* Current node exhausted, pop back up */
06801916 2050 p = node_parent((struct node *)tn);
cb7b593c
SH
2051 if (p) {
2052 cindex = tkey_extract_bits(tn->key, p->pos, p->bits)+1;
2053 tn = p;
2054 --iter->depth;
2055 goto rescan;
19baf839 2056 }
cb7b593c
SH
2057
2058 /* got root? */
2059 return NULL;
19baf839
RO
2060}
2061
cb7b593c
SH
2062static struct node *fib_trie_get_first(struct fib_trie_iter *iter,
2063 struct trie *t)
19baf839 2064{
5ddf0eb2
RO
2065 struct node *n ;
2066
132adf54 2067 if (!t)
5ddf0eb2
RO
2068 return NULL;
2069
2070 n = rcu_dereference(t->trie);
2071
132adf54 2072 if (!iter)
5ddf0eb2 2073 return NULL;
19baf839 2074
6640e697
EB
2075 if (n) {
2076 if (IS_TNODE(n)) {
2077 iter->tnode = (struct tnode *) n;
2078 iter->trie = t;
2079 iter->index = 0;
2080 iter->depth = 1;
2081 } else {
2082 iter->tnode = NULL;
2083 iter->trie = t;
2084 iter->index = 0;
2085 iter->depth = 0;
2086 }
cb7b593c 2087 return n;
91b9a277 2088 }
cb7b593c
SH
2089 return NULL;
2090}
91b9a277 2091
cb7b593c
SH
2092static void trie_collect_stats(struct trie *t, struct trie_stat *s)
2093{
2094 struct node *n;
2095 struct fib_trie_iter iter;
91b9a277 2096
cb7b593c 2097 memset(s, 0, sizeof(*s));
91b9a277 2098
cb7b593c
SH
2099 rcu_read_lock();
2100 for (n = fib_trie_get_first(&iter, t); n;
2101 n = fib_trie_get_next(&iter)) {
2102 if (IS_LEAF(n)) {
2103 s->leaves++;
2104 s->totdepth += iter.depth;
2105 if (iter.depth > s->maxdepth)
2106 s->maxdepth = iter.depth;
2107 } else {
2108 const struct tnode *tn = (const struct tnode *) n;
2109 int i;
2110
2111 s->tnodes++;
132adf54 2112 if (tn->bits < MAX_STAT_DEPTH)
06ef921d
RO
2113 s->nodesizes[tn->bits]++;
2114
cb7b593c
SH
2115 for (i = 0; i < (1<<tn->bits); i++)
2116 if (!tn->child[i])
2117 s->nullpointers++;
19baf839 2118 }
19baf839 2119 }
2373ce1c 2120 rcu_read_unlock();
19baf839
RO
2121}
2122
cb7b593c
SH
2123/*
2124 * This outputs /proc/net/fib_triestats
2125 */
2126static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat)
19baf839 2127{
cb7b593c 2128 unsigned i, max, pointers, bytes, avdepth;
c877efb2 2129
cb7b593c
SH
2130 if (stat->leaves)
2131 avdepth = stat->totdepth*100 / stat->leaves;
2132 else
2133 avdepth = 0;
91b9a277 2134
cb7b593c
SH
2135 seq_printf(seq, "\tAver depth: %d.%02d\n", avdepth / 100, avdepth % 100 );
2136 seq_printf(seq, "\tMax depth: %u\n", stat->maxdepth);
91b9a277 2137
cb7b593c 2138 seq_printf(seq, "\tLeaves: %u\n", stat->leaves);
91b9a277 2139
cb7b593c
SH
2140 bytes = sizeof(struct leaf) * stat->leaves;
2141 seq_printf(seq, "\tInternal nodes: %d\n\t", stat->tnodes);
2142 bytes += sizeof(struct tnode) * stat->tnodes;
19baf839 2143
06ef921d
RO
2144 max = MAX_STAT_DEPTH;
2145 while (max > 0 && stat->nodesizes[max-1] == 0)
cb7b593c 2146 max--;
19baf839 2147
cb7b593c
SH
2148 pointers = 0;
2149 for (i = 1; i <= max; i++)
2150 if (stat->nodesizes[i] != 0) {
2151 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2152 pointers += (1<<i) * stat->nodesizes[i];
2153 }
2154 seq_putc(seq, '\n');
2155 seq_printf(seq, "\tPointers: %d\n", pointers);
2373ce1c 2156
cb7b593c
SH
2157 bytes += sizeof(struct node *) * pointers;
2158 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2159 seq_printf(seq, "Total size: %d kB\n", (bytes + 1023) / 1024);
2373ce1c 2160
cb7b593c
SH
2161#ifdef CONFIG_IP_FIB_TRIE_STATS
2162 seq_printf(seq, "Counters:\n---------\n");
2163 seq_printf(seq,"gets = %d\n", t->stats.gets);
2164 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2165 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2166 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2167 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2168 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
2169#ifdef CLEAR_STATS
2170 memset(&(t->stats), 0, sizeof(t->stats));
2171#endif
2172#endif /* CONFIG_IP_FIB_TRIE_STATS */
2173}
19baf839 2174
cb7b593c
SH
2175static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2176{
2177 struct trie_stat *stat;
91b9a277 2178
cb7b593c
SH
2179 stat = kmalloc(sizeof(*stat), GFP_KERNEL);
2180 if (!stat)
2181 return -ENOMEM;
91b9a277 2182
cb7b593c
SH
2183 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
2184 sizeof(struct leaf), sizeof(struct tnode));
91b9a277 2185
cb7b593c
SH
2186 if (trie_local) {
2187 seq_printf(seq, "Local:\n");
2188 trie_collect_stats(trie_local, stat);
2189 trie_show_stats(seq, stat);
2190 }
91b9a277 2191
cb7b593c
SH
2192 if (trie_main) {
2193 seq_printf(seq, "Main:\n");
2194 trie_collect_stats(trie_main, stat);
2195 trie_show_stats(seq, stat);
19baf839 2196 }
cb7b593c 2197 kfree(stat);
19baf839 2198
cb7b593c 2199 return 0;
19baf839
RO
2200}
2201
cb7b593c 2202static int fib_triestat_seq_open(struct inode *inode, struct file *file)
19baf839 2203{
cb7b593c 2204 return single_open(file, fib_triestat_seq_show, NULL);
19baf839
RO
2205}
2206
9a32144e 2207static const struct file_operations fib_triestat_fops = {
cb7b593c
SH
2208 .owner = THIS_MODULE,
2209 .open = fib_triestat_seq_open,
2210 .read = seq_read,
2211 .llseek = seq_lseek,
2212 .release = single_release,
2213};
2214
2215static struct node *fib_trie_get_idx(struct fib_trie_iter *iter,
2216 loff_t pos)
19baf839 2217{
cb7b593c
SH
2218 loff_t idx = 0;
2219 struct node *n;
2220
2221 for (n = fib_trie_get_first(iter, trie_local);
2222 n; ++idx, n = fib_trie_get_next(iter)) {
2223 if (pos == idx)
2224 return n;
2225 }
2226
2227 for (n = fib_trie_get_first(iter, trie_main);
2228 n; ++idx, n = fib_trie_get_next(iter)) {
2229 if (pos == idx)
2230 return n;
2231 }
19baf839
RO
2232 return NULL;
2233}
2234
cb7b593c 2235static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
19baf839 2236{
cb7b593c
SH
2237 rcu_read_lock();
2238 if (*pos == 0)
91b9a277 2239 return SEQ_START_TOKEN;
cb7b593c 2240 return fib_trie_get_idx(seq->private, *pos - 1);
19baf839
RO
2241}
2242
cb7b593c 2243static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
19baf839 2244{
cb7b593c
SH
2245 struct fib_trie_iter *iter = seq->private;
2246 void *l = v;
2247
19baf839 2248 ++*pos;
91b9a277 2249 if (v == SEQ_START_TOKEN)
cb7b593c 2250 return fib_trie_get_idx(iter, 0);
19baf839 2251
cb7b593c
SH
2252 v = fib_trie_get_next(iter);
2253 BUG_ON(v == l);
2254 if (v)
2255 return v;
19baf839 2256
cb7b593c
SH
2257 /* continue scan in next trie */
2258 if (iter->trie == trie_local)
2259 return fib_trie_get_first(iter, trie_main);
19baf839 2260
cb7b593c
SH
2261 return NULL;
2262}
19baf839 2263
cb7b593c 2264static void fib_trie_seq_stop(struct seq_file *seq, void *v)
19baf839 2265{
cb7b593c
SH
2266 rcu_read_unlock();
2267}
91b9a277 2268
cb7b593c
SH
2269static void seq_indent(struct seq_file *seq, int n)
2270{
2271 while (n-- > 0) seq_puts(seq, " ");
2272}
19baf839 2273
cb7b593c
SH
2274static inline const char *rtn_scope(enum rt_scope_t s)
2275{
2276 static char buf[32];
19baf839 2277
132adf54 2278 switch (s) {
cb7b593c
SH
2279 case RT_SCOPE_UNIVERSE: return "universe";
2280 case RT_SCOPE_SITE: return "site";
2281 case RT_SCOPE_LINK: return "link";
2282 case RT_SCOPE_HOST: return "host";
2283 case RT_SCOPE_NOWHERE: return "nowhere";
2284 default:
2285 snprintf(buf, sizeof(buf), "scope=%d", s);
2286 return buf;
2287 }
2288}
19baf839 2289
cb7b593c
SH
2290static const char *rtn_type_names[__RTN_MAX] = {
2291 [RTN_UNSPEC] = "UNSPEC",
2292 [RTN_UNICAST] = "UNICAST",
2293 [RTN_LOCAL] = "LOCAL",
2294 [RTN_BROADCAST] = "BROADCAST",
2295 [RTN_ANYCAST] = "ANYCAST",
2296 [RTN_MULTICAST] = "MULTICAST",
2297 [RTN_BLACKHOLE] = "BLACKHOLE",
2298 [RTN_UNREACHABLE] = "UNREACHABLE",
2299 [RTN_PROHIBIT] = "PROHIBIT",
2300 [RTN_THROW] = "THROW",
2301 [RTN_NAT] = "NAT",
2302 [RTN_XRESOLVE] = "XRESOLVE",
2303};
19baf839 2304
cb7b593c
SH
2305static inline const char *rtn_type(unsigned t)
2306{
2307 static char buf[32];
19baf839 2308
cb7b593c
SH
2309 if (t < __RTN_MAX && rtn_type_names[t])
2310 return rtn_type_names[t];
2311 snprintf(buf, sizeof(buf), "type %d", t);
2312 return buf;
19baf839
RO
2313}
2314
cb7b593c
SH
2315/* Pretty print the trie */
2316static int fib_trie_seq_show(struct seq_file *seq, void *v)
19baf839 2317{
cb7b593c
SH
2318 const struct fib_trie_iter *iter = seq->private;
2319 struct node *n = v;
c877efb2 2320
cb7b593c
SH
2321 if (v == SEQ_START_TOKEN)
2322 return 0;
19baf839 2323
06801916 2324 if (!node_parent(n)) {
095b8501
RO
2325 if (iter->trie == trie_local)
2326 seq_puts(seq, "<local>:\n");
2327 else
2328 seq_puts(seq, "<main>:\n");
2329 }
2330
cb7b593c
SH
2331 if (IS_TNODE(n)) {
2332 struct tnode *tn = (struct tnode *) n;
32ab5f80 2333 __be32 prf = htonl(MASK_PFX(tn->key, tn->pos));
91b9a277 2334
1d25cd6c
RO
2335 seq_indent(seq, iter->depth-1);
2336 seq_printf(seq, " +-- %d.%d.%d.%d/%d %d %d %d\n",
e905a9ed 2337 NIPQUAD(prf), tn->pos, tn->bits, tn->full_children,
1d25cd6c 2338 tn->empty_children);
e905a9ed 2339
cb7b593c
SH
2340 } else {
2341 struct leaf *l = (struct leaf *) n;
2342 int i;
32ab5f80 2343 __be32 val = htonl(l->key);
cb7b593c
SH
2344
2345 seq_indent(seq, iter->depth);
2346 seq_printf(seq, " |-- %d.%d.%d.%d\n", NIPQUAD(val));
2347 for (i = 32; i >= 0; i--) {
772cb712 2348 struct leaf_info *li = find_leaf_info(l, i);
cb7b593c
SH
2349 if (li) {
2350 struct fib_alias *fa;
2351 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
2352 seq_indent(seq, iter->depth+1);
2353 seq_printf(seq, " /%d %s %s", i,
2354 rtn_scope(fa->fa_scope),
2355 rtn_type(fa->fa_type));
2356 if (fa->fa_tos)
2357 seq_printf(seq, "tos =%d\n",
2358 fa->fa_tos);
2359 seq_putc(seq, '\n');
2360 }
2361 }
2362 }
19baf839 2363 }
cb7b593c 2364
19baf839
RO
2365 return 0;
2366}
2367
f690808e 2368static const struct seq_operations fib_trie_seq_ops = {
cb7b593c
SH
2369 .start = fib_trie_seq_start,
2370 .next = fib_trie_seq_next,
2371 .stop = fib_trie_seq_stop,
2372 .show = fib_trie_seq_show,
19baf839
RO
2373};
2374
cb7b593c 2375static int fib_trie_seq_open(struct inode *inode, struct file *file)
19baf839
RO
2376{
2377 struct seq_file *seq;
2378 int rc = -ENOMEM;
cb7b593c 2379 struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
19baf839 2380
cb7b593c
SH
2381 if (!s)
2382 goto out;
2383
2384 rc = seq_open(file, &fib_trie_seq_ops);
19baf839
RO
2385 if (rc)
2386 goto out_kfree;
2387
cb7b593c
SH
2388 seq = file->private_data;
2389 seq->private = s;
2390 memset(s, 0, sizeof(*s));
19baf839
RO
2391out:
2392 return rc;
2393out_kfree:
cb7b593c 2394 kfree(s);
19baf839
RO
2395 goto out;
2396}
2397
9a32144e 2398static const struct file_operations fib_trie_fops = {
cb7b593c
SH
2399 .owner = THIS_MODULE,
2400 .open = fib_trie_seq_open,
2401 .read = seq_read,
2402 .llseek = seq_lseek,
c877efb2 2403 .release = seq_release_private,
19baf839
RO
2404};
2405
32ab5f80 2406static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi)
19baf839 2407{
cb7b593c
SH
2408 static unsigned type2flags[RTN_MAX + 1] = {
2409 [7] = RTF_REJECT, [8] = RTF_REJECT,
2410 };
2411 unsigned flags = type2flags[type];
19baf839 2412
cb7b593c
SH
2413 if (fi && fi->fib_nh->nh_gw)
2414 flags |= RTF_GATEWAY;
32ab5f80 2415 if (mask == htonl(0xFFFFFFFF))
cb7b593c
SH
2416 flags |= RTF_HOST;
2417 flags |= RTF_UP;
2418 return flags;
19baf839
RO
2419}
2420
cb7b593c
SH
2421/*
2422 * This outputs /proc/net/route.
2423 * The format of the file is not supposed to be changed
2424 * and needs to be same as fib_hash output to avoid breaking
2425 * legacy utilities
2426 */
2427static int fib_route_seq_show(struct seq_file *seq, void *v)
19baf839 2428{
c9e53cbe 2429 const struct fib_trie_iter *iter = seq->private;
cb7b593c
SH
2430 struct leaf *l = v;
2431 int i;
2432 char bf[128];
19baf839 2433
cb7b593c
SH
2434 if (v == SEQ_START_TOKEN) {
2435 seq_printf(seq, "%-127s\n", "Iface\tDestination\tGateway "
2436 "\tFlags\tRefCnt\tUse\tMetric\tMask\t\tMTU"
2437 "\tWindow\tIRTT");
2438 return 0;
2439 }
19baf839 2440
c9e53cbe
PM
2441 if (iter->trie == trie_local)
2442 return 0;
cb7b593c
SH
2443 if (IS_TNODE(l))
2444 return 0;
19baf839 2445
cb7b593c 2446 for (i=32; i>=0; i--) {
772cb712 2447 struct leaf_info *li = find_leaf_info(l, i);
cb7b593c 2448 struct fib_alias *fa;
32ab5f80 2449 __be32 mask, prefix;
91b9a277 2450
cb7b593c
SH
2451 if (!li)
2452 continue;
19baf839 2453
cb7b593c
SH
2454 mask = inet_make_mask(li->plen);
2455 prefix = htonl(l->key);
19baf839 2456
cb7b593c 2457 list_for_each_entry_rcu(fa, &li->falh, fa_list) {
1371e37d 2458 const struct fib_info *fi = fa->fa_info;
cb7b593c 2459 unsigned flags = fib_flag_trans(fa->fa_type, mask, fi);
19baf839 2460
cb7b593c
SH
2461 if (fa->fa_type == RTN_BROADCAST
2462 || fa->fa_type == RTN_MULTICAST)
2463 continue;
19baf839 2464
cb7b593c
SH
2465 if (fi)
2466 snprintf(bf, sizeof(bf),
2467 "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2468 fi->fib_dev ? fi->fib_dev->name : "*",
2469 prefix,
2470 fi->fib_nh->nh_gw, flags, 0, 0,
2471 fi->fib_priority,
2472 mask,
2473 (fi->fib_advmss ? fi->fib_advmss + 40 : 0),
2474 fi->fib_window,
2475 fi->fib_rtt >> 3);
2476 else
2477 snprintf(bf, sizeof(bf),
2478 "*\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u",
2479 prefix, 0, flags, 0, 0, 0,
2480 mask, 0, 0, 0);
19baf839 2481
cb7b593c
SH
2482 seq_printf(seq, "%-127s\n", bf);
2483 }
19baf839
RO
2484 }
2485
2486 return 0;
2487}
2488
f690808e 2489static const struct seq_operations fib_route_seq_ops = {
cb7b593c
SH
2490 .start = fib_trie_seq_start,
2491 .next = fib_trie_seq_next,
2492 .stop = fib_trie_seq_stop,
2493 .show = fib_route_seq_show,
19baf839
RO
2494};
2495
cb7b593c 2496static int fib_route_seq_open(struct inode *inode, struct file *file)
19baf839
RO
2497{
2498 struct seq_file *seq;
2499 int rc = -ENOMEM;
cb7b593c 2500 struct fib_trie_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
19baf839 2501
cb7b593c
SH
2502 if (!s)
2503 goto out;
2504
2505 rc = seq_open(file, &fib_route_seq_ops);
19baf839
RO
2506 if (rc)
2507 goto out_kfree;
2508
cb7b593c
SH
2509 seq = file->private_data;
2510 seq->private = s;
2511 memset(s, 0, sizeof(*s));
19baf839
RO
2512out:
2513 return rc;
2514out_kfree:
cb7b593c 2515 kfree(s);
19baf839
RO
2516 goto out;
2517}
2518
9a32144e 2519static const struct file_operations fib_route_fops = {
cb7b593c
SH
2520 .owner = THIS_MODULE,
2521 .open = fib_route_seq_open,
2522 .read = seq_read,
2523 .llseek = seq_lseek,
2524 .release = seq_release_private,
19baf839
RO
2525};
2526
2527int __init fib_proc_init(void)
2528{
cb7b593c
SH
2529 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_fops))
2530 goto out1;
2531
2532 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_fops))
2533 goto out2;
2534
2535 if (!proc_net_fops_create("route", S_IRUGO, &fib_route_fops))
2536 goto out3;
2537
19baf839 2538 return 0;
cb7b593c
SH
2539
2540out3:
2541 proc_net_remove("fib_triestat");
2542out2:
2543 proc_net_remove("fib_trie");
2544out1:
2545 return -ENOMEM;
19baf839
RO
2546}
2547
2548void __init fib_proc_exit(void)
2549{
2550 proc_net_remove("fib_trie");
cb7b593c
SH
2551 proc_net_remove("fib_triestat");
2552 proc_net_remove("route");
19baf839
RO
2553}
2554
2555#endif /* CONFIG_PROC_FS */