]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/ipv4/fib_trie.c
[IPV4]: FIB Trie cleanups.
[net-next-2.6.git] / net / ipv4 / fib_trie.c
CommitLineData
19baf839
RO
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; either version
5 * 2 of the License, or (at your option) any later version.
6 *
7 * Robert Olsson <robert.olsson@its.uu.se> Uppsala Universitet
8 * & Swedish University of Agricultural Sciences.
9 *
10 * Jens Laas <jens.laas@data.slu.se> Swedish University of
11 * Agricultural Sciences.
12 *
13 * Hans Liss <hans.liss@its.uu.se> Uppsala Universitet
14 *
15 * This work is based on the LPC-trie which is originally descibed in:
16 *
17 * An experimental study of compression methods for dynamic tries
18 * Stefan Nilsson and Matti Tikkanen. Algorithmica, 33(1):19-33, 2002.
19 * http://www.nada.kth.se/~snilsson/public/papers/dyntrie2/
20 *
21 *
22 * IP-address lookup using LC-tries. Stefan Nilsson and Gunnar Karlsson
23 * IEEE Journal on Selected Areas in Communications, 17(6):1083-1092, June 1999
24 *
25 * Version: $Id: fib_trie.c,v 1.3 2005/06/08 14:20:01 robert Exp $
26 *
27 *
28 * Code from fib_hash has been reused which includes the following header:
29 *
30 *
31 * INET An implementation of the TCP/IP protocol suite for the LINUX
32 * operating system. INET is implemented using the BSD Socket
33 * interface as the means of communication with the user level.
34 *
35 * IPv4 FIB: lookup engine and maintenance routines.
36 *
37 *
38 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
39 *
40 * This program is free software; you can redistribute it and/or
41 * modify it under the terms of the GNU General Public License
42 * as published by the Free Software Foundation; either version
43 * 2 of the License, or (at your option) any later version.
44 */
45
2f36895a 46#define VERSION "0.325"
19baf839
RO
47
48#include <linux/config.h>
49#include <asm/uaccess.h>
50#include <asm/system.h>
51#include <asm/bitops.h>
52#include <linux/types.h>
53#include <linux/kernel.h>
54#include <linux/sched.h>
55#include <linux/mm.h>
56#include <linux/string.h>
57#include <linux/socket.h>
58#include <linux/sockios.h>
59#include <linux/errno.h>
60#include <linux/in.h>
61#include <linux/inet.h>
62#include <linux/netdevice.h>
63#include <linux/if_arp.h>
64#include <linux/proc_fs.h>
65#include <linux/skbuff.h>
66#include <linux/netlink.h>
67#include <linux/init.h>
68#include <linux/list.h>
69#include <net/ip.h>
70#include <net/protocol.h>
71#include <net/route.h>
72#include <net/tcp.h>
73#include <net/sock.h>
74#include <net/ip_fib.h>
75#include "fib_lookup.h"
76
77#undef CONFIG_IP_FIB_TRIE_STATS
78#define MAX_CHILDS 16384
79
80#define EXTRACT(p, n, str) ((str)<<(p)>>(32-(n)))
81#define KEYLENGTH (8*sizeof(t_key))
82#define MASK_PFX(k, l) (((l)==0)?0:(k >> (KEYLENGTH-l)) << (KEYLENGTH-l))
83#define TKEY_GET_MASK(offset, bits) (((bits)==0)?0:((t_key)(-1) << (KEYLENGTH - bits) >> offset))
84
85static DEFINE_RWLOCK(fib_lock);
86
87typedef unsigned int t_key;
88
89#define T_TNODE 0
90#define T_LEAF 1
91#define NODE_TYPE_MASK 0x1UL
91b9a277
OJ
92#define NODE_PARENT(node) \
93 ((struct tnode *)((node)->parent & ~NODE_TYPE_MASK))
94#define NODE_SET_PARENT(node, ptr) \
95 ((node)->parent = (((unsigned long)(ptr)) | \
96 ((node)->parent & NODE_TYPE_MASK)))
97#define NODE_INIT_PARENT(node, type) \
98 ((node)->parent = (type))
99#define NODE_TYPE(node) \
100 ((node)->parent & NODE_TYPE_MASK)
101
102#define IS_TNODE(n) (!(n->parent & T_LEAF))
103#define IS_LEAF(n) (n->parent & T_LEAF)
19baf839
RO
104
105struct node {
91b9a277
OJ
106 t_key key;
107 unsigned long parent;
19baf839
RO
108};
109
110struct leaf {
91b9a277
OJ
111 t_key key;
112 unsigned long parent;
19baf839
RO
113 struct hlist_head list;
114};
115
116struct leaf_info {
117 struct hlist_node hlist;
118 int plen;
119 struct list_head falh;
120};
121
122struct tnode {
91b9a277
OJ
123 t_key key;
124 unsigned long parent;
125 unsigned short pos:5; /* 2log(KEYLENGTH) bits needed */
126 unsigned short bits:5; /* 2log(KEYLENGTH) bits needed */
127 unsigned short full_children; /* KEYLENGTH bits needed */
128 unsigned short empty_children; /* KEYLENGTH bits needed */
129 struct node *child[0];
19baf839
RO
130};
131
132#ifdef CONFIG_IP_FIB_TRIE_STATS
133struct trie_use_stats {
134 unsigned int gets;
135 unsigned int backtrack;
136 unsigned int semantic_match_passed;
137 unsigned int semantic_match_miss;
138 unsigned int null_node_hit;
2f36895a 139 unsigned int resize_node_skipped;
19baf839
RO
140};
141#endif
142
143struct trie_stat {
144 unsigned int totdepth;
145 unsigned int maxdepth;
146 unsigned int tnodes;
147 unsigned int leaves;
148 unsigned int nullpointers;
149 unsigned int nodesizes[MAX_CHILDS];
c877efb2 150};
19baf839
RO
151
152struct trie {
91b9a277 153 struct node *trie;
19baf839
RO
154#ifdef CONFIG_IP_FIB_TRIE_STATS
155 struct trie_use_stats stats;
156#endif
91b9a277 157 int size;
19baf839
RO
158 unsigned int revision;
159};
160
161static int trie_debug = 0;
162
91b9a277
OJ
163#define DBG(x...) do { if (trie_debug) printk(x); } while (0)
164
19baf839
RO
165static int tnode_full(struct tnode *tn, struct node *n);
166static void put_child(struct trie *t, struct tnode *tn, int i, struct node *n);
167static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull);
168static int tnode_child_length(struct tnode *tn);
169static struct node *resize(struct trie *t, struct tnode *tn);
2f36895a
RO
170static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err);
171static struct tnode *halve(struct trie *t, struct tnode *tn, int *err);
19baf839
RO
172static void tnode_free(struct tnode *tn);
173static void trie_dump_seq(struct seq_file *seq, struct trie *t);
174extern struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio);
175extern int fib_detect_death(struct fib_info *fi, int order,
91b9a277 176 struct fib_info **last_resort, int *last_idx, int *dflt);
19baf839
RO
177
178extern void rtmsg_fib(int event, u32 key, struct fib_alias *fa, int z, int tb_id,
91b9a277 179 struct nlmsghdr *n, struct netlink_skb_parms *req);
19baf839
RO
180
181static kmem_cache_t *fn_alias_kmem;
182static struct trie *trie_local = NULL, *trie_main = NULL;
183
c877efb2 184static inline struct node *tnode_get_child(struct tnode *tn, int i)
19baf839 185{
91b9a277 186 BUG_ON(i >= 1 << tn->bits);
19baf839 187
91b9a277 188 return tn->child[i];
19baf839
RO
189}
190
191static inline int tnode_child_length(struct tnode *tn)
192{
91b9a277 193 return 1 << tn->bits;
19baf839
RO
194}
195
19baf839
RO
196static inline t_key tkey_extract_bits(t_key a, int offset, int bits)
197{
91b9a277 198 if (offset < KEYLENGTH)
19baf839 199 return ((t_key)(a << offset)) >> (KEYLENGTH - bits);
91b9a277 200 else
19baf839
RO
201 return 0;
202}
203
204static inline int tkey_equals(t_key a, t_key b)
205{
c877efb2 206 return a == b;
19baf839
RO
207}
208
209static inline int tkey_sub_equals(t_key a, int offset, int bits, t_key b)
210{
c877efb2
SH
211 if (bits == 0 || offset >= KEYLENGTH)
212 return 1;
91b9a277
OJ
213 bits = bits > KEYLENGTH ? KEYLENGTH : bits;
214 return ((a ^ b) << offset) >> (KEYLENGTH - bits) == 0;
c877efb2 215}
19baf839
RO
216
217static inline int tkey_mismatch(t_key a, int offset, t_key b)
218{
219 t_key diff = a ^ b;
220 int i = offset;
221
c877efb2
SH
222 if (!diff)
223 return 0;
224 while ((diff << i) >> (KEYLENGTH-1) == 0)
19baf839
RO
225 i++;
226 return i;
227}
228
91b9a277 229/* Candidate for fib_semantics */
19baf839
RO
230
231static void fn_free_alias(struct fib_alias *fa)
232{
233 fib_release_info(fa->fa_info);
234 kmem_cache_free(fn_alias_kmem, fa);
235}
236
237/*
238 To understand this stuff, an understanding of keys and all their bits is
239 necessary. Every node in the trie has a key associated with it, but not
240 all of the bits in that key are significant.
241
242 Consider a node 'n' and its parent 'tp'.
243
244 If n is a leaf, every bit in its key is significant. Its presence is
245 necessitaded by path compression, since during a tree traversal (when
246 searching for a leaf - unless we are doing an insertion) we will completely
247 ignore all skipped bits we encounter. Thus we need to verify, at the end of
248 a potentially successful search, that we have indeed been walking the
249 correct key path.
250
251 Note that we can never "miss" the correct key in the tree if present by
252 following the wrong path. Path compression ensures that segments of the key
253 that are the same for all keys with a given prefix are skipped, but the
254 skipped part *is* identical for each node in the subtrie below the skipped
255 bit! trie_insert() in this implementation takes care of that - note the
256 call to tkey_sub_equals() in trie_insert().
257
258 if n is an internal node - a 'tnode' here, the various parts of its key
259 have many different meanings.
260
261 Example:
262 _________________________________________________________________
263 | i | i | i | i | i | i | i | N | N | N | S | S | S | S | S | C |
264 -----------------------------------------------------------------
265 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
266
267 _________________________________________________________________
268 | C | C | C | u | u | u | u | u | u | u | u | u | u | u | u | u |
269 -----------------------------------------------------------------
270 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
271
272 tp->pos = 7
273 tp->bits = 3
274 n->pos = 15
91b9a277 275 n->bits = 4
19baf839
RO
276
277 First, let's just ignore the bits that come before the parent tp, that is
278 the bits from 0 to (tp->pos-1). They are *known* but at this point we do
279 not use them for anything.
280
281 The bits from (tp->pos) to (tp->pos + tp->bits - 1) - "N", above - are the
282 index into the parent's child array. That is, they will be used to find
283 'n' among tp's children.
284
285 The bits from (tp->pos + tp->bits) to (n->pos - 1) - "S" - are skipped bits
286 for the node n.
287
288 All the bits we have seen so far are significant to the node n. The rest
289 of the bits are really not needed or indeed known in n->key.
290
291 The bits from (n->pos) to (n->pos + n->bits - 1) - "C" - are the index into
292 n's child array, and will of course be different for each child.
293
c877efb2 294
19baf839
RO
295 The rest of the bits, from (n->pos + n->bits) onward, are completely unknown
296 at this point.
297
298*/
299
300static void check_tnode(struct tnode *tn)
301{
c877efb2 302 if (tn && tn->pos+tn->bits > 32) {
19baf839
RO
303 printk("TNODE ERROR tn=%p, pos=%d, bits=%d\n", tn, tn->pos, tn->bits);
304 }
305}
306
307static int halve_threshold = 25;
308static int inflate_threshold = 50;
309
310static struct leaf *leaf_new(void)
311{
312 struct leaf *l = kmalloc(sizeof(struct leaf), GFP_KERNEL);
c877efb2 313 if (l) {
19baf839
RO
314 NODE_INIT_PARENT(l, T_LEAF);
315 INIT_HLIST_HEAD(&l->list);
316 }
317 return l;
318}
319
320static struct leaf_info *leaf_info_new(int plen)
321{
322 struct leaf_info *li = kmalloc(sizeof(struct leaf_info), GFP_KERNEL);
91b9a277
OJ
323
324 if (!li)
325 return NULL;
326
327 li->plen = plen;
328 INIT_LIST_HEAD(&li->falh);
329
19baf839
RO
330 return li;
331}
332
333static inline void free_leaf(struct leaf *l)
334{
335 kfree(l);
336}
337
338static inline void free_leaf_info(struct leaf_info *li)
339{
340 kfree(li);
341}
342
f0e36f8c
PM
343static struct tnode *tnode_alloc(unsigned int size)
344{
345 if (size <= PAGE_SIZE) {
346 return kmalloc(size, GFP_KERNEL);
347 } else {
348 return (struct tnode *)
c877efb2 349 __get_free_pages(GFP_KERNEL, get_order(size));
f0e36f8c
PM
350 }
351}
352
353static void __tnode_free(struct tnode *tn)
354{
355 unsigned int size = sizeof(struct tnode) +
91b9a277 356 (1 << tn->bits) * sizeof(struct node *);
f0e36f8c
PM
357
358 if (size <= PAGE_SIZE)
359 kfree(tn);
360 else
361 free_pages((unsigned long)tn, get_order(size));
362}
363
19baf839
RO
364static struct tnode* tnode_new(t_key key, int pos, int bits)
365{
366 int nchildren = 1<<bits;
367 int sz = sizeof(struct tnode) + nchildren * sizeof(struct node *);
f0e36f8c 368 struct tnode *tn = tnode_alloc(sz);
19baf839 369
91b9a277 370 if (tn) {
19baf839
RO
371 memset(tn, 0, sz);
372 NODE_INIT_PARENT(tn, T_TNODE);
373 tn->pos = pos;
374 tn->bits = bits;
375 tn->key = key;
376 tn->full_children = 0;
377 tn->empty_children = 1<<bits;
378 }
c877efb2 379
91b9a277
OJ
380 DBG("AT %p s=%u %u\n", tn, (unsigned int) sizeof(struct tnode),
381 (unsigned int) (sizeof(struct node) * 1<<bits));
19baf839
RO
382 return tn;
383}
384
385static void tnode_free(struct tnode *tn)
386{
91b9a277
OJ
387 BUG_ON(!tn);
388
c877efb2 389 if (IS_LEAF(tn)) {
19baf839 390 free_leaf((struct leaf *)tn);
91b9a277
OJ
391 DBG("FL %p \n", tn);
392 } else {
f0e36f8c 393 __tnode_free(tn);
91b9a277 394 DBG("FT %p \n", tn);
19baf839
RO
395 }
396}
397
398/*
399 * Check whether a tnode 'n' is "full", i.e. it is an internal node
400 * and no bits are skipped. See discussion in dyntree paper p. 6
401 */
402
403static inline int tnode_full(struct tnode *tn, struct node *n)
404{
c877efb2 405 if (n == NULL || IS_LEAF(n))
19baf839
RO
406 return 0;
407
408 return ((struct tnode *) n)->pos == tn->pos + tn->bits;
409}
410
c877efb2 411static inline void put_child(struct trie *t, struct tnode *tn, int i, struct node *n)
19baf839
RO
412{
413 tnode_put_child_reorg(tn, i, n, -1);
414}
415
c877efb2 416 /*
19baf839
RO
417 * Add a child at position i overwriting the old value.
418 * Update the value of full_children and empty_children.
419 */
420
c877efb2 421static void tnode_put_child_reorg(struct tnode *tn, int i, struct node *n, int wasfull)
19baf839
RO
422{
423 struct node *chi;
424 int isfull;
425
c877efb2 426 if (i >= 1<<tn->bits) {
19baf839 427 printk("bits=%d, i=%d\n", tn->bits, i);
91b9a277 428 BUG();
19baf839
RO
429 }
430 write_lock_bh(&fib_lock);
c877efb2 431 chi = tn->child[i];
19baf839
RO
432
433 /* update emptyChildren */
434 if (n == NULL && chi != NULL)
435 tn->empty_children++;
436 else if (n != NULL && chi == NULL)
437 tn->empty_children--;
c877efb2 438
19baf839 439 /* update fullChildren */
91b9a277 440 if (wasfull == -1)
19baf839
RO
441 wasfull = tnode_full(tn, chi);
442
443 isfull = tnode_full(tn, n);
c877efb2 444 if (wasfull && !isfull)
19baf839 445 tn->full_children--;
c877efb2 446 else if (!wasfull && isfull)
19baf839 447 tn->full_children++;
91b9a277 448
c877efb2
SH
449 if (n)
450 NODE_SET_PARENT(n, tn);
19baf839
RO
451
452 tn->child[i] = n;
453 write_unlock_bh(&fib_lock);
454}
455
c877efb2 456static struct node *resize(struct trie *t, struct tnode *tn)
19baf839
RO
457{
458 int i;
2f36895a 459 int err = 0;
19baf839
RO
460
461 if (!tn)
462 return NULL;
463
91b9a277
OJ
464 DBG("In tnode_resize %p inflate_threshold=%d threshold=%d\n",
465 tn, inflate_threshold, halve_threshold);
19baf839
RO
466
467 /* No children */
468 if (tn->empty_children == tnode_child_length(tn)) {
469 tnode_free(tn);
470 return NULL;
471 }
472 /* One child */
473 if (tn->empty_children == tnode_child_length(tn) - 1)
474 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 475 struct node *n;
19baf839
RO
476
477 write_lock_bh(&fib_lock);
91b9a277
OJ
478 n = tn->child[i];
479 if (!n) {
19baf839 480 write_unlock_bh(&fib_lock);
91b9a277 481 continue;
19baf839 482 }
91b9a277
OJ
483
484 /* compress one level */
485 NODE_INIT_PARENT(n, NODE_TYPE(n));
486
19baf839 487 write_unlock_bh(&fib_lock);
91b9a277
OJ
488 tnode_free(tn);
489 return n;
19baf839 490 }
c877efb2 491 /*
19baf839
RO
492 * Double as long as the resulting node has a number of
493 * nonempty nodes that are above the threshold.
494 */
495
496 /*
c877efb2
SH
497 * From "Implementing a dynamic compressed trie" by Stefan Nilsson of
498 * the Helsinki University of Technology and Matti Tikkanen of Nokia
19baf839 499 * Telecommunications, page 6:
c877efb2 500 * "A node is doubled if the ratio of non-empty children to all
19baf839
RO
501 * children in the *doubled* node is at least 'high'."
502 *
c877efb2
SH
503 * 'high' in this instance is the variable 'inflate_threshold'. It
504 * is expressed as a percentage, so we multiply it with
505 * tnode_child_length() and instead of multiplying by 2 (since the
506 * child array will be doubled by inflate()) and multiplying
507 * the left-hand side by 100 (to handle the percentage thing) we
19baf839 508 * multiply the left-hand side by 50.
c877efb2
SH
509 *
510 * The left-hand side may look a bit weird: tnode_child_length(tn)
511 * - tn->empty_children is of course the number of non-null children
512 * in the current node. tn->full_children is the number of "full"
19baf839 513 * children, that is non-null tnodes with a skip value of 0.
c877efb2 514 * All of those will be doubled in the resulting inflated tnode, so
19baf839 515 * we just count them one extra time here.
c877efb2 516 *
19baf839 517 * A clearer way to write this would be:
c877efb2 518 *
19baf839 519 * to_be_doubled = tn->full_children;
c877efb2 520 * not_to_be_doubled = tnode_child_length(tn) - tn->empty_children -
19baf839
RO
521 * tn->full_children;
522 *
523 * new_child_length = tnode_child_length(tn) * 2;
524 *
c877efb2 525 * new_fill_factor = 100 * (not_to_be_doubled + 2*to_be_doubled) /
19baf839
RO
526 * new_child_length;
527 * if (new_fill_factor >= inflate_threshold)
c877efb2
SH
528 *
529 * ...and so on, tho it would mess up the while () loop.
530 *
19baf839
RO
531 * anyway,
532 * 100 * (not_to_be_doubled + 2*to_be_doubled) / new_child_length >=
533 * inflate_threshold
c877efb2 534 *
19baf839
RO
535 * avoid a division:
536 * 100 * (not_to_be_doubled + 2*to_be_doubled) >=
537 * inflate_threshold * new_child_length
c877efb2 538 *
19baf839 539 * expand not_to_be_doubled and to_be_doubled, and shorten:
c877efb2 540 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 541 * tn->full_children) >= inflate_threshold * new_child_length
c877efb2 542 *
19baf839 543 * expand new_child_length:
c877efb2 544 * 100 * (tnode_child_length(tn) - tn->empty_children +
91b9a277 545 * tn->full_children) >=
19baf839 546 * inflate_threshold * tnode_child_length(tn) * 2
c877efb2 547 *
19baf839 548 * shorten again:
c877efb2 549 * 50 * (tn->full_children + tnode_child_length(tn) -
91b9a277 550 * tn->empty_children) >= inflate_threshold *
19baf839 551 * tnode_child_length(tn)
c877efb2 552 *
19baf839
RO
553 */
554
555 check_tnode(tn);
c877efb2 556
2f36895a 557 err = 0;
19baf839
RO
558 while ((tn->full_children > 0 &&
559 50 * (tn->full_children + tnode_child_length(tn) - tn->empty_children) >=
560 inflate_threshold * tnode_child_length(tn))) {
561
2f36895a
RO
562 tn = inflate(t, tn, &err);
563
c877efb2 564 if (err) {
2f36895a
RO
565#ifdef CONFIG_IP_FIB_TRIE_STATS
566 t->stats.resize_node_skipped++;
567#endif
568 break;
569 }
19baf839
RO
570 }
571
572 check_tnode(tn);
573
574 /*
575 * Halve as long as the number of empty children in this
576 * node is above threshold.
577 */
2f36895a
RO
578
579 err = 0;
19baf839
RO
580 while (tn->bits > 1 &&
581 100 * (tnode_child_length(tn) - tn->empty_children) <
2f36895a
RO
582 halve_threshold * tnode_child_length(tn)) {
583
584 tn = halve(t, tn, &err);
585
c877efb2 586 if (err) {
2f36895a
RO
587#ifdef CONFIG_IP_FIB_TRIE_STATS
588 t->stats.resize_node_skipped++;
589#endif
590 break;
591 }
592 }
19baf839 593
c877efb2 594
19baf839
RO
595 /* Only one child remains */
596
597 if (tn->empty_children == tnode_child_length(tn) - 1)
598 for (i = 0; i < tnode_child_length(tn); i++) {
91b9a277 599 struct node *n;
19baf839 600
91b9a277 601 write_lock_bh(&fib_lock);
19baf839 602
91b9a277
OJ
603 n = tn->child[i];
604 if (!n) {
19baf839 605 write_unlock_bh(&fib_lock);
91b9a277 606 continue;
19baf839 607 }
91b9a277
OJ
608
609 /* compress one level */
610
611 NODE_INIT_PARENT(n, NODE_TYPE(n));
612
19baf839 613 write_unlock_bh(&fib_lock);
91b9a277
OJ
614 tnode_free(tn);
615 return n;
19baf839
RO
616 }
617
618 return (struct node *) tn;
619}
620
2f36895a 621static struct tnode *inflate(struct trie *t, struct tnode *tn, int *err)
19baf839
RO
622{
623 struct tnode *inode;
624 struct tnode *oldtnode = tn;
625 int olen = tnode_child_length(tn);
626 int i;
627
91b9a277 628 DBG("In inflate\n");
19baf839
RO
629
630 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits + 1);
631
2f36895a
RO
632 if (!tn) {
633 *err = -ENOMEM;
634 return oldtnode;
635 }
636
637 /*
c877efb2
SH
638 * Preallocate and store tnodes before the actual work so we
639 * don't get into an inconsistent state if memory allocation
640 * fails. In case of failure we return the oldnode and inflate
2f36895a
RO
641 * of tnode is ignored.
642 */
91b9a277
OJ
643
644 for (i = 0; i < olen; i++) {
2f36895a
RO
645 struct tnode *inode = (struct tnode *) tnode_get_child(oldtnode, i);
646
647 if (inode &&
648 IS_TNODE(inode) &&
649 inode->pos == oldtnode->pos + oldtnode->bits &&
650 inode->bits > 1) {
651 struct tnode *left, *right;
2f36895a 652 t_key m = TKEY_GET_MASK(inode->pos, 1);
c877efb2 653
2f36895a
RO
654 left = tnode_new(inode->key&(~m), inode->pos + 1,
655 inode->bits - 1);
656
c877efb2
SH
657 if (!left) {
658 *err = -ENOMEM;
2f36895a
RO
659 break;
660 }
91b9a277 661
2f36895a
RO
662 right = tnode_new(inode->key|m, inode->pos + 1,
663 inode->bits - 1);
664
c877efb2
SH
665 if (!right) {
666 *err = -ENOMEM;
2f36895a
RO
667 break;
668 }
669
670 put_child(t, tn, 2*i, (struct node *) left);
671 put_child(t, tn, 2*i+1, (struct node *) right);
672 }
673 }
674
c877efb2 675 if (*err) {
2f36895a
RO
676 int size = tnode_child_length(tn);
677 int j;
678
91b9a277 679 for (j = 0; j < size; j++)
c877efb2 680 if (tn->child[j])
2f36895a
RO
681 tnode_free((struct tnode *)tn->child[j]);
682
683 tnode_free(tn);
91b9a277 684
2f36895a
RO
685 *err = -ENOMEM;
686 return oldtnode;
687 }
19baf839 688
91b9a277 689 for (i = 0; i < olen; i++) {
19baf839 690 struct node *node = tnode_get_child(oldtnode, i);
91b9a277
OJ
691 struct tnode *left, *right;
692 int size, j;
c877efb2 693
19baf839
RO
694 /* An empty child */
695 if (node == NULL)
696 continue;
697
698 /* A leaf or an internal node with skipped bits */
699
c877efb2 700 if (IS_LEAF(node) || ((struct tnode *) node)->pos >
19baf839 701 tn->pos + tn->bits - 1) {
c877efb2 702 if (tkey_extract_bits(node->key, oldtnode->pos + oldtnode->bits,
19baf839
RO
703 1) == 0)
704 put_child(t, tn, 2*i, node);
705 else
706 put_child(t, tn, 2*i+1, node);
707 continue;
708 }
709
710 /* An internal node with two children */
711 inode = (struct tnode *) node;
712
713 if (inode->bits == 1) {
714 put_child(t, tn, 2*i, inode->child[0]);
715 put_child(t, tn, 2*i+1, inode->child[1]);
716
717 tnode_free(inode);
91b9a277 718 continue;
19baf839
RO
719 }
720
91b9a277
OJ
721 /* An internal node with more than two children */
722
723 /* We will replace this node 'inode' with two new
724 * ones, 'left' and 'right', each with half of the
725 * original children. The two new nodes will have
726 * a position one bit further down the key and this
727 * means that the "significant" part of their keys
728 * (see the discussion near the top of this file)
729 * will differ by one bit, which will be "0" in
730 * left's key and "1" in right's key. Since we are
731 * moving the key position by one step, the bit that
732 * we are moving away from - the bit at position
733 * (inode->pos) - is the one that will differ between
734 * left and right. So... we synthesize that bit in the
735 * two new keys.
736 * The mask 'm' below will be a single "one" bit at
737 * the position (inode->pos)
738 */
19baf839 739
91b9a277
OJ
740 /* Use the old key, but set the new significant
741 * bit to zero.
742 */
2f36895a 743
91b9a277
OJ
744 left = (struct tnode *) tnode_get_child(tn, 2*i);
745 put_child(t, tn, 2*i, NULL);
2f36895a 746
91b9a277 747 BUG_ON(!left);
2f36895a 748
91b9a277
OJ
749 right = (struct tnode *) tnode_get_child(tn, 2*i+1);
750 put_child(t, tn, 2*i+1, NULL);
19baf839 751
91b9a277 752 BUG_ON(!right);
19baf839 753
91b9a277
OJ
754 size = tnode_child_length(left);
755 for (j = 0; j < size; j++) {
756 put_child(t, left, j, inode->child[j]);
757 put_child(t, right, j, inode->child[j + size]);
19baf839 758 }
91b9a277
OJ
759 put_child(t, tn, 2*i, resize(t, left));
760 put_child(t, tn, 2*i+1, resize(t, right));
761
762 tnode_free(inode);
19baf839
RO
763 }
764 tnode_free(oldtnode);
765 return tn;
766}
767
2f36895a 768static struct tnode *halve(struct trie *t, struct tnode *tn, int *err)
19baf839
RO
769{
770 struct tnode *oldtnode = tn;
771 struct node *left, *right;
772 int i;
773 int olen = tnode_child_length(tn);
774
91b9a277 775 DBG("In halve\n");
c877efb2
SH
776
777 tn = tnode_new(oldtnode->key, oldtnode->pos, oldtnode->bits - 1);
19baf839 778
2f36895a
RO
779 if (!tn) {
780 *err = -ENOMEM;
781 return oldtnode;
782 }
783
784 /*
c877efb2
SH
785 * Preallocate and store tnodes before the actual work so we
786 * don't get into an inconsistent state if memory allocation
787 * fails. In case of failure we return the oldnode and halve
2f36895a
RO
788 * of tnode is ignored.
789 */
790
91b9a277 791 for (i = 0; i < olen; i += 2) {
2f36895a
RO
792 left = tnode_get_child(oldtnode, i);
793 right = tnode_get_child(oldtnode, i+1);
c877efb2 794
2f36895a 795 /* Two nonempty children */
c877efb2 796 if (left && right) {
2f36895a
RO
797 struct tnode *newBinNode =
798 tnode_new(left->key, tn->pos + tn->bits, 1);
799
c877efb2
SH
800 if (!newBinNode) {
801 *err = -ENOMEM;
2f36895a
RO
802 break;
803 }
804 put_child(t, tn, i/2, (struct node *)newBinNode);
805 }
806 }
807
c877efb2 808 if (*err) {
2f36895a
RO
809 int size = tnode_child_length(tn);
810 int j;
811
91b9a277 812 for (j = 0; j < size; j++)
c877efb2 813 if (tn->child[j])
2f36895a
RO
814 tnode_free((struct tnode *)tn->child[j]);
815
816 tnode_free(tn);
91b9a277 817
2f36895a
RO
818 *err = -ENOMEM;
819 return oldtnode;
820 }
19baf839 821
91b9a277
OJ
822 for (i = 0; i < olen; i += 2) {
823 struct tnode *newBinNode;
824
19baf839
RO
825 left = tnode_get_child(oldtnode, i);
826 right = tnode_get_child(oldtnode, i+1);
c877efb2 827
19baf839
RO
828 /* At least one of the children is empty */
829 if (left == NULL) {
830 if (right == NULL) /* Both are empty */
831 continue;
832 put_child(t, tn, i/2, right);
91b9a277
OJ
833 continue;
834 }
835
836 if (right == NULL) {
19baf839 837 put_child(t, tn, i/2, left);
91b9a277
OJ
838 continue;
839 }
c877efb2 840
19baf839 841 /* Two nonempty children */
91b9a277
OJ
842 newBinNode = (struct tnode *) tnode_get_child(tn, i/2);
843 put_child(t, tn, i/2, NULL);
19baf839 844
91b9a277 845 BUG_ON(!newBinNode);
19baf839 846
91b9a277
OJ
847 put_child(t, newBinNode, 0, left);
848 put_child(t, newBinNode, 1, right);
849 put_child(t, tn, i/2, resize(t, newBinNode));
19baf839
RO
850 }
851 tnode_free(oldtnode);
852 return tn;
853}
854
91b9a277 855static void trie_init(struct trie *t)
19baf839 856{
91b9a277
OJ
857 if (!t)
858 return;
859
860 t->size = 0;
861 t->trie = NULL;
862 t->revision = 0;
19baf839 863#ifdef CONFIG_IP_FIB_TRIE_STATS
91b9a277 864 memset(&t->stats, 0, sizeof(struct trie_use_stats));
19baf839 865#endif
19baf839
RO
866}
867
868static struct leaf_info *find_leaf_info(struct hlist_head *head, int plen)
869{
870 struct hlist_node *node;
871 struct leaf_info *li;
872
91b9a277 873 hlist_for_each_entry(li, node, head, hlist)
c877efb2 874 if (li->plen == plen)
19baf839 875 return li;
91b9a277 876
19baf839
RO
877 return NULL;
878}
879
880static inline struct list_head * get_fa_head(struct leaf *l, int plen)
881{
19baf839 882 struct leaf_info *li = find_leaf_info(&l->list, plen);
c877efb2 883
91b9a277
OJ
884 if (!li)
885 return NULL;
c877efb2 886
91b9a277 887 return &li->falh;
19baf839
RO
888}
889
890static void insert_leaf_info(struct hlist_head *head, struct leaf_info *new)
891{
c877efb2 892 struct leaf_info *li = NULL, *last = NULL;
91b9a277 893 struct hlist_node *node;
19baf839
RO
894
895 write_lock_bh(&fib_lock);
c877efb2 896
91b9a277 897 if (hlist_empty(head)) {
19baf839 898 hlist_add_head(&new->hlist, head);
91b9a277
OJ
899 } else {
900 hlist_for_each_entry(li, node, head, hlist) {
c877efb2 901 if (new->plen > li->plen)
19baf839 902 break;
91b9a277 903
19baf839
RO
904 last = li;
905 }
c877efb2 906 if (last)
19baf839 907 hlist_add_after(&last->hlist, &new->hlist);
c877efb2 908 else
19baf839
RO
909 hlist_add_before(&new->hlist, &li->hlist);
910 }
911 write_unlock_bh(&fib_lock);
912}
913
914static struct leaf *
915fib_find_node(struct trie *t, u32 key)
916{
917 int pos;
918 struct tnode *tn;
919 struct node *n;
920
921 pos = 0;
c877efb2 922 n = t->trie;
19baf839
RO
923
924 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
925 tn = (struct tnode *) n;
91b9a277 926
19baf839 927 check_tnode(tn);
91b9a277 928
c877efb2 929 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
91b9a277 930 pos = tn->pos + tn->bits;
19baf839 931 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
91b9a277 932 } else
19baf839
RO
933 break;
934 }
935 /* Case we have found a leaf. Compare prefixes */
936
91b9a277
OJ
937 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key))
938 return (struct leaf *)n;
939
19baf839
RO
940 return NULL;
941}
942
943static struct node *trie_rebalance(struct trie *t, struct tnode *tn)
944{
91b9a277 945 int i;
19baf839
RO
946 int wasfull;
947 t_key cindex, key;
948 struct tnode *tp = NULL;
949
91b9a277 950 BUG_ON(!tn);
c877efb2 951
19baf839
RO
952 key = tn->key;
953 i = 0;
954
955 while (tn != NULL && NODE_PARENT(tn) != NULL) {
c877efb2 956 if (i > 10) {
19baf839 957 printk("Rebalance tn=%p \n", tn);
91b9a277
OJ
958 if (tn)
959 printk("tn->parent=%p \n", NODE_PARENT(tn));
960
19baf839 961 printk("Rebalance tp=%p \n", tp);
91b9a277
OJ
962 if (tp)
963 printk("tp->parent=%p \n", NODE_PARENT(tp));
19baf839
RO
964 }
965
91b9a277 966 BUG_ON(i > 12); /* Why is this a bug? -ojn */
19baf839
RO
967 i++;
968
969 tp = NODE_PARENT(tn);
970 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
971 wasfull = tnode_full(tp, tnode_get_child(tp, cindex));
972 tn = (struct tnode *) resize (t, (struct tnode *)tn);
973 tnode_put_child_reorg((struct tnode *)tp, cindex,(struct node*)tn, wasfull);
91b9a277 974
c877efb2 975 if (!NODE_PARENT(tn))
19baf839
RO
976 break;
977
978 tn = NODE_PARENT(tn);
979 }
980 /* Handle last (top) tnode */
c877efb2 981 if (IS_TNODE(tn))
19baf839
RO
982 tn = (struct tnode*) resize(t, (struct tnode *)tn);
983
984 return (struct node*) tn;
985}
986
f835e471
RO
987static struct list_head *
988fib_insert_node(struct trie *t, int *err, u32 key, int plen)
19baf839
RO
989{
990 int pos, newpos;
991 struct tnode *tp = NULL, *tn = NULL;
992 struct node *n;
993 struct leaf *l;
994 int missbit;
c877efb2 995 struct list_head *fa_head = NULL;
19baf839
RO
996 struct leaf_info *li;
997 t_key cindex;
998
999 pos = 0;
c877efb2 1000 n = t->trie;
19baf839 1001
c877efb2
SH
1002 /* If we point to NULL, stop. Either the tree is empty and we should
1003 * just put a new leaf in if, or we have reached an empty child slot,
19baf839 1004 * and we should just put our new leaf in that.
c877efb2
SH
1005 * If we point to a T_TNODE, check if it matches our key. Note that
1006 * a T_TNODE might be skipping any number of bits - its 'pos' need
19baf839
RO
1007 * not be the parent's 'pos'+'bits'!
1008 *
c877efb2 1009 * If it does match the current key, get pos/bits from it, extract
19baf839
RO
1010 * the index from our key, push the T_TNODE and walk the tree.
1011 *
1012 * If it doesn't, we have to replace it with a new T_TNODE.
1013 *
c877efb2
SH
1014 * If we point to a T_LEAF, it might or might not have the same key
1015 * as we do. If it does, just change the value, update the T_LEAF's
1016 * value, and return it.
19baf839
RO
1017 * If it doesn't, we need to replace it with a T_TNODE.
1018 */
1019
1020 while (n != NULL && NODE_TYPE(n) == T_TNODE) {
1021 tn = (struct tnode *) n;
91b9a277 1022
c877efb2 1023 check_tnode(tn);
91b9a277 1024
c877efb2 1025 if (tkey_sub_equals(tn->key, pos, tn->pos-pos, key)) {
19baf839 1026 tp = tn;
91b9a277 1027 pos = tn->pos + tn->bits;
19baf839
RO
1028 n = tnode_get_child(tn, tkey_extract_bits(key, tn->pos, tn->bits));
1029
c877efb2 1030 if (n && NODE_PARENT(n) != tn) {
19baf839
RO
1031 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1032 BUG();
1033 }
91b9a277 1034 } else
19baf839
RO
1035 break;
1036 }
1037
1038 /*
1039 * n ----> NULL, LEAF or TNODE
1040 *
c877efb2 1041 * tp is n's (parent) ----> NULL or TNODE
19baf839
RO
1042 */
1043
91b9a277 1044 BUG_ON(tp && IS_LEAF(tp));
19baf839
RO
1045
1046 /* Case 1: n is a leaf. Compare prefixes */
1047
c877efb2 1048 if (n != NULL && IS_LEAF(n) && tkey_equals(key, n->key)) {
91b9a277
OJ
1049 struct leaf *l = (struct leaf *) n;
1050
19baf839 1051 li = leaf_info_new(plen);
91b9a277 1052
c877efb2 1053 if (!li) {
f835e471
RO
1054 *err = -ENOMEM;
1055 goto err;
1056 }
19baf839
RO
1057
1058 fa_head = &li->falh;
1059 insert_leaf_info(&l->list, li);
1060 goto done;
1061 }
1062 t->size++;
1063 l = leaf_new();
1064
c877efb2 1065 if (!l) {
f835e471
RO
1066 *err = -ENOMEM;
1067 goto err;
1068 }
19baf839
RO
1069
1070 l->key = key;
1071 li = leaf_info_new(plen);
1072
c877efb2 1073 if (!li) {
f835e471
RO
1074 tnode_free((struct tnode *) l);
1075 *err = -ENOMEM;
1076 goto err;
1077 }
19baf839
RO
1078
1079 fa_head = &li->falh;
1080 insert_leaf_info(&l->list, li);
1081
19baf839 1082 if (t->trie && n == NULL) {
91b9a277 1083 /* Case 2: n is NULL, and will just insert a new leaf */
19baf839
RO
1084
1085 NODE_SET_PARENT(l, tp);
19baf839 1086
91b9a277
OJ
1087 BUG_ON(!tp);
1088
1089 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1090 put_child(t, (struct tnode *)tp, cindex, (struct node *)l);
1091 } else {
1092 /* Case 3: n is a LEAF or a TNODE and the key doesn't match. */
c877efb2
SH
1093 /*
1094 * Add a new tnode here
19baf839
RO
1095 * first tnode need some special handling
1096 */
1097
1098 if (tp)
91b9a277 1099 pos = tp->pos+tp->bits;
19baf839 1100 else
91b9a277
OJ
1101 pos = 0;
1102
c877efb2 1103 if (n) {
19baf839
RO
1104 newpos = tkey_mismatch(key, pos, n->key);
1105 tn = tnode_new(n->key, newpos, 1);
91b9a277 1106 } else {
19baf839 1107 newpos = 0;
c877efb2 1108 tn = tnode_new(key, newpos, 1); /* First tnode */
19baf839 1109 }
19baf839 1110
c877efb2 1111 if (!tn) {
f835e471
RO
1112 free_leaf_info(li);
1113 tnode_free((struct tnode *) l);
1114 *err = -ENOMEM;
1115 goto err;
91b9a277
OJ
1116 }
1117
19baf839
RO
1118 NODE_SET_PARENT(tn, tp);
1119
91b9a277 1120 missbit = tkey_extract_bits(key, newpos, 1);
19baf839
RO
1121 put_child(t, tn, missbit, (struct node *)l);
1122 put_child(t, tn, 1-missbit, n);
1123
c877efb2 1124 if (tp) {
19baf839
RO
1125 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1126 put_child(t, (struct tnode *)tp, cindex, (struct node *)tn);
91b9a277 1127 } else {
19baf839
RO
1128 t->trie = (struct node*) tn; /* First tnode */
1129 tp = tn;
1130 }
1131 }
91b9a277
OJ
1132
1133 if (tp && tp->pos + tp->bits > 32)
c877efb2 1134 printk("ERROR tp=%p pos=%d, bits=%d, key=%0x plen=%d\n",
19baf839 1135 tp, tp->pos, tp->bits, key, plen);
91b9a277 1136
19baf839
RO
1137 /* Rebalance the trie */
1138 t->trie = trie_rebalance(t, tp);
f835e471
RO
1139done:
1140 t->revision++;
91b9a277 1141err:
19baf839
RO
1142 return fa_head;
1143}
1144
1145static int
1146fn_trie_insert(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
1147 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
1148{
1149 struct trie *t = (struct trie *) tb->tb_data;
1150 struct fib_alias *fa, *new_fa;
c877efb2 1151 struct list_head *fa_head = NULL;
19baf839
RO
1152 struct fib_info *fi;
1153 int plen = r->rtm_dst_len;
1154 int type = r->rtm_type;
1155 u8 tos = r->rtm_tos;
1156 u32 key, mask;
1157 int err;
1158 struct leaf *l;
1159
1160 if (plen > 32)
1161 return -EINVAL;
1162
1163 key = 0;
c877efb2 1164 if (rta->rta_dst)
19baf839
RO
1165 memcpy(&key, rta->rta_dst, 4);
1166
1167 key = ntohl(key);
1168
91b9a277 1169 DBG("Insert table=%d %08x/%d\n", tb->tb_id, key, plen);
19baf839 1170
91b9a277 1171 mask = ntohl(inet_make_mask(plen));
19baf839 1172
c877efb2 1173 if (key & ~mask)
19baf839
RO
1174 return -EINVAL;
1175
1176 key = key & mask;
1177
91b9a277
OJ
1178 fi = fib_create_info(r, rta, nlhdr, &err);
1179
1180 if (!fi)
19baf839
RO
1181 goto err;
1182
1183 l = fib_find_node(t, key);
c877efb2 1184 fa = NULL;
19baf839 1185
c877efb2 1186 if (l) {
19baf839
RO
1187 fa_head = get_fa_head(l, plen);
1188 fa = fib_find_alias(fa_head, tos, fi->fib_priority);
1189 }
1190
1191 /* Now fa, if non-NULL, points to the first fib alias
1192 * with the same keys [prefix,tos,priority], if such key already
1193 * exists or to the node before which we will insert new one.
1194 *
1195 * If fa is NULL, we will need to allocate a new one and
1196 * insert to the head of f.
1197 *
1198 * If f is NULL, no fib node matched the destination key
1199 * and we need to allocate a new one of those as well.
1200 */
1201
91b9a277 1202 if (fa && fa->fa_info->fib_priority == fi->fib_priority) {
19baf839
RO
1203 struct fib_alias *fa_orig;
1204
1205 err = -EEXIST;
1206 if (nlhdr->nlmsg_flags & NLM_F_EXCL)
1207 goto out;
1208
1209 if (nlhdr->nlmsg_flags & NLM_F_REPLACE) {
1210 struct fib_info *fi_drop;
1211 u8 state;
1212
1213 write_lock_bh(&fib_lock);
1214
1215 fi_drop = fa->fa_info;
1216 fa->fa_info = fi;
1217 fa->fa_type = type;
1218 fa->fa_scope = r->rtm_scope;
1219 state = fa->fa_state;
1220 fa->fa_state &= ~FA_S_ACCESSED;
1221
1222 write_unlock_bh(&fib_lock);
1223
1224 fib_release_info(fi_drop);
1225 if (state & FA_S_ACCESSED)
91b9a277 1226 rt_cache_flush(-1);
19baf839 1227
91b9a277 1228 goto succeeded;
19baf839
RO
1229 }
1230 /* Error if we find a perfect match which
1231 * uses the same scope, type, and nexthop
1232 * information.
1233 */
1234 fa_orig = fa;
1235 list_for_each_entry(fa, fa_orig->fa_list.prev, fa_list) {
1236 if (fa->fa_tos != tos)
1237 break;
1238 if (fa->fa_info->fib_priority != fi->fib_priority)
1239 break;
1240 if (fa->fa_type == type &&
1241 fa->fa_scope == r->rtm_scope &&
1242 fa->fa_info == fi) {
1243 goto out;
1244 }
1245 }
1246 if (!(nlhdr->nlmsg_flags & NLM_F_APPEND))
1247 fa = fa_orig;
1248 }
1249 err = -ENOENT;
91b9a277 1250 if (!(nlhdr->nlmsg_flags & NLM_F_CREATE))
19baf839
RO
1251 goto out;
1252
1253 err = -ENOBUFS;
1254 new_fa = kmem_cache_alloc(fn_alias_kmem, SLAB_KERNEL);
1255 if (new_fa == NULL)
1256 goto out;
1257
1258 new_fa->fa_info = fi;
1259 new_fa->fa_tos = tos;
1260 new_fa->fa_type = type;
1261 new_fa->fa_scope = r->rtm_scope;
1262 new_fa->fa_state = 0;
19baf839
RO
1263 /*
1264 * Insert new entry to the list.
1265 */
1266
c877efb2 1267 if (!fa_head) {
f835e471
RO
1268 fa_head = fib_insert_node(t, &err, key, plen);
1269 err = 0;
c877efb2 1270 if (err)
f835e471
RO
1271 goto out_free_new_fa;
1272 }
19baf839
RO
1273
1274 write_lock_bh(&fib_lock);
1275
91b9a277 1276 list_add_tail(&new_fa->fa_list, (fa ? &fa->fa_list : fa_head));
19baf839
RO
1277
1278 write_unlock_bh(&fib_lock);
1279
1280 rt_cache_flush(-1);
1281 rtmsg_fib(RTM_NEWROUTE, htonl(key), new_fa, plen, tb->tb_id, nlhdr, req);
1282succeeded:
1283 return 0;
f835e471
RO
1284
1285out_free_new_fa:
1286 kmem_cache_free(fn_alias_kmem, new_fa);
19baf839
RO
1287out:
1288 fib_release_info(fi);
91b9a277 1289err:
19baf839
RO
1290 return err;
1291}
1292
c877efb2 1293static inline int check_leaf(struct trie *t, struct leaf *l, t_key key, int *plen, const struct flowi *flp,
06c74270 1294 struct fib_result *res)
19baf839 1295{
06c74270 1296 int err, i;
19baf839
RO
1297 t_key mask;
1298 struct leaf_info *li;
1299 struct hlist_head *hhead = &l->list;
1300 struct hlist_node *node;
c877efb2 1301
19baf839 1302 hlist_for_each_entry(li, node, hhead, hlist) {
19baf839
RO
1303 i = li->plen;
1304 mask = ntohl(inet_make_mask(i));
c877efb2 1305 if (l->key != (key & mask))
19baf839
RO
1306 continue;
1307
06c74270 1308 if ((err = fib_semantic_match(&li->falh, flp, res, l->key, mask, i)) <= 0) {
19baf839
RO
1309 *plen = i;
1310#ifdef CONFIG_IP_FIB_TRIE_STATS
1311 t->stats.semantic_match_passed++;
1312#endif
06c74270 1313 return err;
19baf839
RO
1314 }
1315#ifdef CONFIG_IP_FIB_TRIE_STATS
1316 t->stats.semantic_match_miss++;
1317#endif
1318 }
06c74270 1319 return 1;
19baf839
RO
1320}
1321
1322static int
1323fn_trie_lookup(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1324{
1325 struct trie *t = (struct trie *) tb->tb_data;
1326 int plen, ret = 0;
1327 struct node *n;
1328 struct tnode *pn;
1329 int pos, bits;
91b9a277 1330 t_key key = ntohl(flp->fl4_dst);
19baf839
RO
1331 int chopped_off;
1332 t_key cindex = 0;
1333 int current_prefix_length = KEYLENGTH;
91b9a277
OJ
1334 struct tnode *cn;
1335 t_key node_prefix, key_prefix, pref_mismatch;
1336 int mp;
1337
19baf839
RO
1338 n = t->trie;
1339
1340 read_lock(&fib_lock);
91b9a277 1341
c877efb2 1342 if (!n)
19baf839
RO
1343 goto failed;
1344
1345#ifdef CONFIG_IP_FIB_TRIE_STATS
1346 t->stats.gets++;
1347#endif
1348
1349 /* Just a leaf? */
1350 if (IS_LEAF(n)) {
06c74270 1351 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
19baf839
RO
1352 goto found;
1353 goto failed;
1354 }
1355 pn = (struct tnode *) n;
1356 chopped_off = 0;
c877efb2 1357
91b9a277 1358 while (pn) {
19baf839
RO
1359 pos = pn->pos;
1360 bits = pn->bits;
1361
c877efb2 1362 if (!chopped_off)
19baf839
RO
1363 cindex = tkey_extract_bits(MASK_PFX(key, current_prefix_length), pos, bits);
1364
1365 n = tnode_get_child(pn, cindex);
1366
1367 if (n == NULL) {
1368#ifdef CONFIG_IP_FIB_TRIE_STATS
1369 t->stats.null_node_hit++;
1370#endif
1371 goto backtrace;
1372 }
1373
91b9a277
OJ
1374 if (IS_LEAF(n)) {
1375 if ((ret = check_leaf(t, (struct leaf *)n, key, &plen, flp, res)) <= 0)
1376 goto found;
1377 else
1378 goto backtrace;
1379 }
1380
19baf839
RO
1381#define HL_OPTIMIZE
1382#ifdef HL_OPTIMIZE
91b9a277 1383 cn = (struct tnode *)n;
19baf839 1384
91b9a277
OJ
1385 /*
1386 * It's a tnode, and we can do some extra checks here if we
1387 * like, to avoid descending into a dead-end branch.
1388 * This tnode is in the parent's child array at index
1389 * key[p_pos..p_pos+p_bits] but potentially with some bits
1390 * chopped off, so in reality the index may be just a
1391 * subprefix, padded with zero at the end.
1392 * We can also take a look at any skipped bits in this
1393 * tnode - everything up to p_pos is supposed to be ok,
1394 * and the non-chopped bits of the index (se previous
1395 * paragraph) are also guaranteed ok, but the rest is
1396 * considered unknown.
1397 *
1398 * The skipped bits are key[pos+bits..cn->pos].
1399 */
19baf839 1400
91b9a277
OJ
1401 /* If current_prefix_length < pos+bits, we are already doing
1402 * actual prefix matching, which means everything from
1403 * pos+(bits-chopped_off) onward must be zero along some
1404 * branch of this subtree - otherwise there is *no* valid
1405 * prefix present. Here we can only check the skipped
1406 * bits. Remember, since we have already indexed into the
1407 * parent's child array, we know that the bits we chopped of
1408 * *are* zero.
1409 */
19baf839 1410
91b9a277 1411 /* NOTA BENE: CHECKING ONLY SKIPPED BITS FOR THE NEW NODE HERE */
19baf839 1412
91b9a277
OJ
1413 if (current_prefix_length < pos+bits) {
1414 if (tkey_extract_bits(cn->key, current_prefix_length,
1415 cn->pos - current_prefix_length) != 0 ||
1416 !(cn->child[0]))
1417 goto backtrace;
1418 }
19baf839 1419
91b9a277
OJ
1420 /*
1421 * If chopped_off=0, the index is fully validated and we
1422 * only need to look at the skipped bits for this, the new,
1423 * tnode. What we actually want to do is to find out if
1424 * these skipped bits match our key perfectly, or if we will
1425 * have to count on finding a matching prefix further down,
1426 * because if we do, we would like to have some way of
1427 * verifying the existence of such a prefix at this point.
1428 */
19baf839 1429
91b9a277
OJ
1430 /* The only thing we can do at this point is to verify that
1431 * any such matching prefix can indeed be a prefix to our
1432 * key, and if the bits in the node we are inspecting that
1433 * do not match our key are not ZERO, this cannot be true.
1434 * Thus, find out where there is a mismatch (before cn->pos)
1435 * and verify that all the mismatching bits are zero in the
1436 * new tnode's key.
1437 */
19baf839 1438
91b9a277
OJ
1439 /* Note: We aren't very concerned about the piece of the key
1440 * that precede pn->pos+pn->bits, since these have already been
1441 * checked. The bits after cn->pos aren't checked since these are
1442 * by definition "unknown" at this point. Thus, what we want to
1443 * see is if we are about to enter the "prefix matching" state,
1444 * and in that case verify that the skipped bits that will prevail
1445 * throughout this subtree are zero, as they have to be if we are
1446 * to find a matching prefix.
1447 */
1448
1449 node_prefix = MASK_PFX(cn->key, cn->pos);
1450 key_prefix = MASK_PFX(key, cn->pos);
1451 pref_mismatch = key_prefix^node_prefix;
1452 mp = 0;
1453
1454 /* In short: If skipped bits in this node do not match the search
1455 * key, enter the "prefix matching" state.directly.
1456 */
1457 if (pref_mismatch) {
1458 while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) {
1459 mp++;
1460 pref_mismatch = pref_mismatch <<1;
1461 }
1462 key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp);
1463
1464 if (key_prefix != 0)
1465 goto backtrace;
1466
1467 if (current_prefix_length >= cn->pos)
1468 current_prefix_length = mp;
c877efb2 1469 }
91b9a277
OJ
1470#endif
1471 pn = (struct tnode *)n; /* Descend */
1472 chopped_off = 0;
1473 continue;
1474
19baf839
RO
1475backtrace:
1476 chopped_off++;
1477
1478 /* As zero don't change the child key (cindex) */
91b9a277 1479 while ((chopped_off <= pn->bits) && !(cindex & (1<<(chopped_off-1))))
19baf839 1480 chopped_off++;
19baf839
RO
1481
1482 /* Decrease current_... with bits chopped off */
1483 if (current_prefix_length > pn->pos + pn->bits - chopped_off)
1484 current_prefix_length = pn->pos + pn->bits - chopped_off;
91b9a277 1485
19baf839 1486 /*
c877efb2 1487 * Either we do the actual chop off according or if we have
19baf839
RO
1488 * chopped off all bits in this tnode walk up to our parent.
1489 */
1490
91b9a277 1491 if (chopped_off <= pn->bits) {
19baf839 1492 cindex &= ~(1 << (chopped_off-1));
91b9a277 1493 } else {
c877efb2 1494 if (NODE_PARENT(pn) == NULL)
19baf839 1495 goto failed;
91b9a277 1496
19baf839
RO
1497 /* Get Child's index */
1498 cindex = tkey_extract_bits(pn->key, NODE_PARENT(pn)->pos, NODE_PARENT(pn)->bits);
1499 pn = NODE_PARENT(pn);
1500 chopped_off = 0;
1501
1502#ifdef CONFIG_IP_FIB_TRIE_STATS
1503 t->stats.backtrack++;
1504#endif
1505 goto backtrace;
c877efb2 1506 }
19baf839
RO
1507 }
1508failed:
c877efb2 1509 ret = 1;
19baf839
RO
1510found:
1511 read_unlock(&fib_lock);
1512 return ret;
1513}
1514
1515static int trie_leaf_remove(struct trie *t, t_key key)
1516{
1517 t_key cindex;
1518 struct tnode *tp = NULL;
1519 struct node *n = t->trie;
1520 struct leaf *l;
1521
91b9a277 1522 DBG("entering trie_leaf_remove(%p)\n", n);
19baf839
RO
1523
1524 /* Note that in the case skipped bits, those bits are *not* checked!
c877efb2 1525 * When we finish this, we will have NULL or a T_LEAF, and the
19baf839
RO
1526 * T_LEAF may or may not match our key.
1527 */
1528
91b9a277 1529 while (n != NULL && IS_TNODE(n)) {
19baf839
RO
1530 struct tnode *tn = (struct tnode *) n;
1531 check_tnode(tn);
1532 n = tnode_get_child(tn ,tkey_extract_bits(key, tn->pos, tn->bits));
1533
91b9a277
OJ
1534 if (n && NODE_PARENT(n) != tn) {
1535 printk("BUG tn=%p, n->parent=%p\n", tn, NODE_PARENT(n));
1536 BUG();
1537 }
1538 }
19baf839
RO
1539 l = (struct leaf *) n;
1540
c877efb2 1541 if (!n || !tkey_equals(l->key, key))
19baf839 1542 return 0;
c877efb2
SH
1543
1544 /*
1545 * Key found.
1546 * Remove the leaf and rebalance the tree
19baf839
RO
1547 */
1548
1549 t->revision++;
1550 t->size--;
1551
1552 tp = NODE_PARENT(n);
1553 tnode_free((struct tnode *) n);
1554
c877efb2 1555 if (tp) {
19baf839
RO
1556 cindex = tkey_extract_bits(key, tp->pos, tp->bits);
1557 put_child(t, (struct tnode *)tp, cindex, NULL);
1558 t->trie = trie_rebalance(t, tp);
91b9a277 1559 } else
19baf839
RO
1560 t->trie = NULL;
1561
1562 return 1;
1563}
1564
1565static int
1566fn_trie_delete(struct fib_table *tb, struct rtmsg *r, struct kern_rta *rta,
91b9a277 1567 struct nlmsghdr *nlhdr, struct netlink_skb_parms *req)
19baf839
RO
1568{
1569 struct trie *t = (struct trie *) tb->tb_data;
1570 u32 key, mask;
1571 int plen = r->rtm_dst_len;
1572 u8 tos = r->rtm_tos;
1573 struct fib_alias *fa, *fa_to_delete;
1574 struct list_head *fa_head;
1575 struct leaf *l;
91b9a277
OJ
1576 int kill_li = 0;
1577 struct leaf_info *li;
1578
19baf839 1579
c877efb2 1580 if (plen > 32)
19baf839
RO
1581 return -EINVAL;
1582
1583 key = 0;
c877efb2 1584 if (rta->rta_dst)
19baf839
RO
1585 memcpy(&key, rta->rta_dst, 4);
1586
1587 key = ntohl(key);
91b9a277 1588 mask = ntohl(inet_make_mask(plen));
19baf839 1589
c877efb2 1590 if (key & ~mask)
19baf839
RO
1591 return -EINVAL;
1592
1593 key = key & mask;
1594 l = fib_find_node(t, key);
1595
c877efb2 1596 if (!l)
19baf839
RO
1597 return -ESRCH;
1598
1599 fa_head = get_fa_head(l, plen);
1600 fa = fib_find_alias(fa_head, tos, 0);
1601
1602 if (!fa)
1603 return -ESRCH;
1604
91b9a277 1605 DBG("Deleting %08x/%d tos=%d t=%p\n", key, plen, tos, t);
19baf839
RO
1606
1607 fa_to_delete = NULL;
1608 fa_head = fa->fa_list.prev;
1609 list_for_each_entry(fa, fa_head, fa_list) {
1610 struct fib_info *fi = fa->fa_info;
1611
1612 if (fa->fa_tos != tos)
1613 break;
1614
1615 if ((!r->rtm_type ||
1616 fa->fa_type == r->rtm_type) &&
1617 (r->rtm_scope == RT_SCOPE_NOWHERE ||
1618 fa->fa_scope == r->rtm_scope) &&
1619 (!r->rtm_protocol ||
1620 fi->fib_protocol == r->rtm_protocol) &&
1621 fib_nh_match(r, nlhdr, rta, fi) == 0) {
1622 fa_to_delete = fa;
1623 break;
1624 }
1625 }
1626
91b9a277
OJ
1627 if (!fa_to_delete)
1628 return -ESRCH;
19baf839 1629
91b9a277
OJ
1630 fa = fa_to_delete;
1631 rtmsg_fib(RTM_DELROUTE, htonl(key), fa, plen, tb->tb_id, nlhdr, req);
1632
1633 l = fib_find_node(t, key);
1634 li = find_leaf_info(&l->list, plen);
19baf839 1635
91b9a277 1636 write_lock_bh(&fib_lock);
19baf839 1637
91b9a277 1638 list_del(&fa->fa_list);
19baf839 1639
91b9a277
OJ
1640 if (list_empty(fa_head)) {
1641 hlist_del(&li->hlist);
1642 kill_li = 1;
1643 }
1644 write_unlock_bh(&fib_lock);
19baf839 1645
91b9a277
OJ
1646 if (kill_li)
1647 free_leaf_info(li);
19baf839 1648
91b9a277
OJ
1649 if (hlist_empty(&l->list))
1650 trie_leaf_remove(t, key);
19baf839 1651
91b9a277
OJ
1652 if (fa->fa_state & FA_S_ACCESSED)
1653 rt_cache_flush(-1);
19baf839 1654
91b9a277
OJ
1655 fn_free_alias(fa);
1656 return 0;
19baf839
RO
1657}
1658
1659static int trie_flush_list(struct trie *t, struct list_head *head)
1660{
1661 struct fib_alias *fa, *fa_node;
1662 int found = 0;
1663
1664 list_for_each_entry_safe(fa, fa_node, head, fa_list) {
1665 struct fib_info *fi = fa->fa_info;
19baf839 1666
91b9a277 1667 if (fi && (fi->fib_flags&RTNH_F_DEAD)) {
c877efb2 1668 write_lock_bh(&fib_lock);
19baf839 1669 list_del(&fa->fa_list);
c877efb2 1670 write_unlock_bh(&fib_lock);
19baf839
RO
1671
1672 fn_free_alias(fa);
1673 found++;
1674 }
1675 }
1676 return found;
1677}
1678
1679static int trie_flush_leaf(struct trie *t, struct leaf *l)
1680{
1681 int found = 0;
1682 struct hlist_head *lih = &l->list;
1683 struct hlist_node *node, *tmp;
1684 struct leaf_info *li = NULL;
1685
1686 hlist_for_each_entry_safe(li, node, tmp, lih, hlist) {
19baf839
RO
1687 found += trie_flush_list(t, &li->falh);
1688
1689 if (list_empty(&li->falh)) {
c877efb2 1690 write_lock_bh(&fib_lock);
19baf839 1691 hlist_del(&li->hlist);
c877efb2 1692 write_unlock_bh(&fib_lock);
19baf839
RO
1693
1694 free_leaf_info(li);
1695 }
1696 }
1697 return found;
1698}
1699
1700static struct leaf *nextleaf(struct trie *t, struct leaf *thisleaf)
1701{
1702 struct node *c = (struct node *) thisleaf;
1703 struct tnode *p;
1704 int idx;
1705
c877efb2
SH
1706 if (c == NULL) {
1707 if (t->trie == NULL)
19baf839
RO
1708 return NULL;
1709
1710 if (IS_LEAF(t->trie)) /* trie w. just a leaf */
1711 return (struct leaf *) t->trie;
1712
1713 p = (struct tnode*) t->trie; /* Start */
91b9a277 1714 } else
19baf839 1715 p = (struct tnode *) NODE_PARENT(c);
c877efb2 1716
19baf839
RO
1717 while (p) {
1718 int pos, last;
1719
1720 /* Find the next child of the parent */
c877efb2
SH
1721 if (c)
1722 pos = 1 + tkey_extract_bits(c->key, p->pos, p->bits);
1723 else
19baf839
RO
1724 pos = 0;
1725
1726 last = 1 << p->bits;
91b9a277
OJ
1727 for (idx = pos; idx < last ; idx++) {
1728 if (!p->child[idx])
1729 continue;
1730
1731 /* Decend if tnode */
1732 while (IS_TNODE(p->child[idx])) {
1733 p = (struct tnode*) p->child[idx];
1734 idx = 0;
1735
1736 /* Rightmost non-NULL branch */
1737 if (p && IS_TNODE(p))
1738 while (p->child[idx] == NULL && idx < (1 << p->bits)) idx++;
1739
1740 /* Done with this tnode? */
1741 if (idx >= (1 << p->bits) || p->child[idx] == NULL)
1742 goto up;
19baf839 1743 }
91b9a277 1744 return (struct leaf*) p->child[idx];
19baf839
RO
1745 }
1746up:
1747 /* No more children go up one step */
91b9a277 1748 c = (struct node *) p;
19baf839
RO
1749 p = (struct tnode *) NODE_PARENT(p);
1750 }
1751 return NULL; /* Ready. Root of trie */
1752}
1753
1754static int fn_trie_flush(struct fib_table *tb)
1755{
1756 struct trie *t = (struct trie *) tb->tb_data;
1757 struct leaf *ll = NULL, *l = NULL;
1758 int found = 0, h;
1759
1760 t->revision++;
1761
91b9a277 1762 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
19baf839
RO
1763 found += trie_flush_leaf(t, l);
1764
1765 if (ll && hlist_empty(&ll->list))
1766 trie_leaf_remove(t, ll->key);
1767 ll = l;
1768 }
1769
1770 if (ll && hlist_empty(&ll->list))
1771 trie_leaf_remove(t, ll->key);
1772
91b9a277 1773 DBG("trie_flush found=%d\n", found);
19baf839
RO
1774 return found;
1775}
1776
91b9a277 1777static int trie_last_dflt = -1;
19baf839
RO
1778
1779static void
1780fn_trie_select_default(struct fib_table *tb, const struct flowi *flp, struct fib_result *res)
1781{
1782 struct trie *t = (struct trie *) tb->tb_data;
1783 int order, last_idx;
1784 struct fib_info *fi = NULL;
1785 struct fib_info *last_resort;
1786 struct fib_alias *fa = NULL;
1787 struct list_head *fa_head;
1788 struct leaf *l;
1789
1790 last_idx = -1;
1791 last_resort = NULL;
1792 order = -1;
1793
1794 read_lock(&fib_lock);
c877efb2 1795
19baf839 1796 l = fib_find_node(t, 0);
c877efb2 1797 if (!l)
19baf839
RO
1798 goto out;
1799
1800 fa_head = get_fa_head(l, 0);
c877efb2 1801 if (!fa_head)
19baf839
RO
1802 goto out;
1803
c877efb2 1804 if (list_empty(fa_head))
19baf839
RO
1805 goto out;
1806
1807 list_for_each_entry(fa, fa_head, fa_list) {
1808 struct fib_info *next_fi = fa->fa_info;
91b9a277 1809
19baf839
RO
1810 if (fa->fa_scope != res->scope ||
1811 fa->fa_type != RTN_UNICAST)
1812 continue;
91b9a277 1813
19baf839
RO
1814 if (next_fi->fib_priority > res->fi->fib_priority)
1815 break;
1816 if (!next_fi->fib_nh[0].nh_gw ||
1817 next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
1818 continue;
1819 fa->fa_state |= FA_S_ACCESSED;
91b9a277 1820
19baf839
RO
1821 if (fi == NULL) {
1822 if (next_fi != res->fi)
1823 break;
1824 } else if (!fib_detect_death(fi, order, &last_resort,
1825 &last_idx, &trie_last_dflt)) {
1826 if (res->fi)
1827 fib_info_put(res->fi);
1828 res->fi = fi;
1829 atomic_inc(&fi->fib_clntref);
1830 trie_last_dflt = order;
1831 goto out;
1832 }
1833 fi = next_fi;
1834 order++;
1835 }
1836 if (order <= 0 || fi == NULL) {
1837 trie_last_dflt = -1;
1838 goto out;
1839 }
1840
1841 if (!fib_detect_death(fi, order, &last_resort, &last_idx, &trie_last_dflt)) {
1842 if (res->fi)
1843 fib_info_put(res->fi);
1844 res->fi = fi;
1845 atomic_inc(&fi->fib_clntref);
1846 trie_last_dflt = order;
1847 goto out;
1848 }
1849 if (last_idx >= 0) {
1850 if (res->fi)
1851 fib_info_put(res->fi);
1852 res->fi = last_resort;
1853 if (last_resort)
1854 atomic_inc(&last_resort->fib_clntref);
1855 }
1856 trie_last_dflt = last_idx;
1857 out:;
c877efb2 1858 read_unlock(&fib_lock);
19baf839
RO
1859}
1860
c877efb2 1861static int fn_trie_dump_fa(t_key key, int plen, struct list_head *fah, struct fib_table *tb,
19baf839
RO
1862 struct sk_buff *skb, struct netlink_callback *cb)
1863{
1864 int i, s_i;
1865 struct fib_alias *fa;
1866
91b9a277 1867 u32 xkey = htonl(key);
19baf839 1868
91b9a277 1869 s_i = cb->args[3];
19baf839
RO
1870 i = 0;
1871
1872 list_for_each_entry(fa, fah, fa_list) {
1873 if (i < s_i) {
1874 i++;
1875 continue;
1876 }
1877 if (fa->fa_info->fib_nh == NULL) {
1878 printk("Trie error _fib_nh=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1879 i++;
1880 continue;
1881 }
1882 if (fa->fa_info == NULL) {
1883 printk("Trie error fa_info=NULL in fa[%d] k=%08x plen=%d\n", i, key, plen);
1884 i++;
1885 continue;
1886 }
1887
1888 if (fib_dump_info(skb, NETLINK_CB(cb->skb).pid,
1889 cb->nlh->nlmsg_seq,
1890 RTM_NEWROUTE,
1891 tb->tb_id,
1892 fa->fa_type,
1893 fa->fa_scope,
1894 &xkey,
1895 plen,
1896 fa->fa_tos,
90f66914 1897 fa->fa_info, 0) < 0) {
19baf839
RO
1898 cb->args[3] = i;
1899 return -1;
91b9a277 1900 }
19baf839
RO
1901 i++;
1902 }
91b9a277 1903 cb->args[3] = i;
19baf839
RO
1904 return skb->len;
1905}
1906
c877efb2 1907static int fn_trie_dump_plen(struct trie *t, int plen, struct fib_table *tb, struct sk_buff *skb,
19baf839
RO
1908 struct netlink_callback *cb)
1909{
1910 int h, s_h;
1911 struct list_head *fa_head;
1912 struct leaf *l = NULL;
19baf839 1913
91b9a277 1914 s_h = cb->args[2];
19baf839 1915
91b9a277 1916 for (h = 0; (l = nextleaf(t, l)) != NULL; h++) {
19baf839
RO
1917 if (h < s_h)
1918 continue;
1919 if (h > s_h)
1920 memset(&cb->args[3], 0,
1921 sizeof(cb->args) - 3*sizeof(cb->args[0]));
1922
1923 fa_head = get_fa_head(l, plen);
91b9a277 1924
c877efb2 1925 if (!fa_head)
19baf839
RO
1926 continue;
1927
c877efb2 1928 if (list_empty(fa_head))
19baf839
RO
1929 continue;
1930
1931 if (fn_trie_dump_fa(l->key, plen, fa_head, tb, skb, cb)<0) {
91b9a277 1932 cb->args[2] = h;
19baf839
RO
1933 return -1;
1934 }
1935 }
91b9a277 1936 cb->args[2] = h;
19baf839
RO
1937 return skb->len;
1938}
1939
1940static int fn_trie_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb)
1941{
1942 int m, s_m;
1943 struct trie *t = (struct trie *) tb->tb_data;
1944
1945 s_m = cb->args[1];
1946
1947 read_lock(&fib_lock);
91b9a277 1948 for (m = 0; m <= 32; m++) {
19baf839
RO
1949 if (m < s_m)
1950 continue;
1951 if (m > s_m)
1952 memset(&cb->args[2], 0,
91b9a277 1953 sizeof(cb->args) - 2*sizeof(cb->args[0]));
19baf839
RO
1954
1955 if (fn_trie_dump_plen(t, 32-m, tb, skb, cb)<0) {
1956 cb->args[1] = m;
1957 goto out;
1958 }
1959 }
1960 read_unlock(&fib_lock);
1961 cb->args[1] = m;
1962 return skb->len;
91b9a277 1963out:
19baf839
RO
1964 read_unlock(&fib_lock);
1965 return -1;
1966}
1967
1968/* Fix more generic FIB names for init later */
1969
1970#ifdef CONFIG_IP_MULTIPLE_TABLES
1971struct fib_table * fib_hash_init(int id)
1972#else
1973struct fib_table * __init fib_hash_init(int id)
1974#endif
1975{
1976 struct fib_table *tb;
1977 struct trie *t;
1978
1979 if (fn_alias_kmem == NULL)
1980 fn_alias_kmem = kmem_cache_create("ip_fib_alias",
1981 sizeof(struct fib_alias),
1982 0, SLAB_HWCACHE_ALIGN,
1983 NULL, NULL);
1984
1985 tb = kmalloc(sizeof(struct fib_table) + sizeof(struct trie),
1986 GFP_KERNEL);
1987 if (tb == NULL)
1988 return NULL;
1989
1990 tb->tb_id = id;
1991 tb->tb_lookup = fn_trie_lookup;
1992 tb->tb_insert = fn_trie_insert;
1993 tb->tb_delete = fn_trie_delete;
1994 tb->tb_flush = fn_trie_flush;
1995 tb->tb_select_default = fn_trie_select_default;
1996 tb->tb_dump = fn_trie_dump;
1997 memset(tb->tb_data, 0, sizeof(struct trie));
1998
1999 t = (struct trie *) tb->tb_data;
2000
2001 trie_init(t);
2002
c877efb2 2003 if (id == RT_TABLE_LOCAL)
91b9a277 2004 trie_local = t;
c877efb2 2005 else if (id == RT_TABLE_MAIN)
91b9a277 2006 trie_main = t;
19baf839
RO
2007
2008 if (id == RT_TABLE_LOCAL)
2009 printk("IPv4 FIB: Using LC-trie version %s\n", VERSION);
2010
2011 return tb;
2012}
2013
2014/* Trie dump functions */
2015
2016static void putspace_seq(struct seq_file *seq, int n)
2017{
91b9a277
OJ
2018 while (n--)
2019 seq_printf(seq, " ");
19baf839
RO
2020}
2021
2022static void printbin_seq(struct seq_file *seq, unsigned int v, int bits)
2023{
2024 while (bits--)
2025 seq_printf(seq, "%s", (v & (1<<bits))?"1":"0");
2026}
2027
c877efb2 2028static void printnode_seq(struct seq_file *seq, int indent, struct node *n,
19baf839
RO
2029 int pend, int cindex, int bits)
2030{
2031 putspace_seq(seq, indent);
2032 if (IS_LEAF(n))
2033 seq_printf(seq, "|");
2034 else
2035 seq_printf(seq, "+");
2036 if (bits) {
2037 seq_printf(seq, "%d/", cindex);
2038 printbin_seq(seq, cindex, bits);
2039 seq_printf(seq, ": ");
91b9a277 2040 } else
19baf839
RO
2041 seq_printf(seq, "<root>: ");
2042 seq_printf(seq, "%s:%p ", IS_LEAF(n)?"Leaf":"Internal node", n);
2043
19baf839 2044 if (IS_LEAF(n)) {
91b9a277 2045 struct leaf *l = (struct leaf *)n;
19baf839
RO
2046 struct fib_alias *fa;
2047 int i;
91b9a277
OJ
2048
2049 seq_printf(seq, "key=%d.%d.%d.%d\n",
2050 n->key >> 24, (n->key >> 16) % 256, (n->key >> 8) % 256, n->key % 256);
2051
2052 for (i = 32; i >= 0; i--)
2053 if (find_leaf_info(&l->list, i)) {
19baf839 2054 struct list_head *fa_head = get_fa_head(l, i);
91b9a277 2055
c877efb2 2056 if (!fa_head)
19baf839
RO
2057 continue;
2058
c877efb2 2059 if (list_empty(fa_head))
19baf839
RO
2060 continue;
2061
2062 putspace_seq(seq, indent+2);
2063 seq_printf(seq, "{/%d...dumping}\n", i);
2064
19baf839
RO
2065 list_for_each_entry(fa, fa_head, fa_list) {
2066 putspace_seq(seq, indent+2);
19baf839
RO
2067 if (fa->fa_info == NULL) {
2068 seq_printf(seq, "Error fa_info=NULL\n");
2069 continue;
2070 }
91b9a277
OJ
2071 if (fa->fa_info->fib_nh == NULL) {
2072 seq_printf(seq, "Error _fib_nh=NULL\n");
2073 continue;
2074 }
19baf839
RO
2075
2076 seq_printf(seq, "{type=%d scope=%d TOS=%d}\n",
2077 fa->fa_type,
2078 fa->fa_scope,
2079 fa->fa_tos);
2080 }
2081 }
91b9a277 2082 } else {
c877efb2 2083 struct tnode *tn = (struct tnode *)n;
91b9a277
OJ
2084 int plen = ((struct tnode *)n)->pos;
2085 t_key prf = MASK_PFX(n->key, plen);
2086
2087 seq_printf(seq, "key=%d.%d.%d.%d/%d\n",
2088 prf >> 24, (prf >> 16) % 256, (prf >> 8) % 256, prf % 256, plen);
2089
19baf839 2090 putspace_seq(seq, indent); seq_printf(seq, "| ");
91b9a277 2091 seq_printf(seq, "{key prefix=%08x/", tn->key & TKEY_GET_MASK(0, tn->pos));
19baf839
RO
2092 printbin_seq(seq, tkey_extract_bits(tn->key, 0, tn->pos), tn->pos);
2093 seq_printf(seq, "}\n");
2094 putspace_seq(seq, indent); seq_printf(seq, "| ");
2095 seq_printf(seq, "{pos=%d", tn->pos);
2096 seq_printf(seq, " (skip=%d bits)", tn->pos - pend);
2097 seq_printf(seq, " bits=%d (%u children)}\n", tn->bits, (1 << tn->bits));
2098 putspace_seq(seq, indent); seq_printf(seq, "| ");
2099 seq_printf(seq, "{empty=%d full=%d}\n", tn->empty_children, tn->full_children);
2100 }
2101}
2102
2103static void trie_dump_seq(struct seq_file *seq, struct trie *t)
2104{
c877efb2 2105 struct node *n = t->trie;
91b9a277
OJ
2106 int cindex = 0;
2107 int indent = 1;
2108 int pend = 0;
19baf839 2109 int depth = 0;
91b9a277 2110 struct tnode *tn;
19baf839
RO
2111
2112 read_lock(&fib_lock);
2113
2114 seq_printf(seq, "------ trie_dump of t=%p ------\n", t);
19baf839 2115
91b9a277
OJ
2116 if (!n) {
2117 seq_printf(seq, "------ trie is empty\n");
2118
2119 read_unlock(&fib_lock);
2120 return;
2121 }
2122
2123 printnode_seq(seq, indent, n, pend, cindex, 0);
2124
2125 if (!IS_TNODE(n)) {
2126 read_unlock(&fib_lock);
2127 return;
2128 }
2129
2130 tn = (struct tnode *)n;
2131 pend = tn->pos+tn->bits;
2132 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2133 indent += 3;
2134 depth++;
2135
2136 while (tn && cindex < (1 << tn->bits)) {
2137 if (tn->child[cindex]) {
2138 /* Got a child */
2139
2140 printnode_seq(seq, indent, tn->child[cindex], pend, cindex, tn->bits);
2141 if (IS_LEAF(tn->child[cindex])) {
2142 cindex++;
2143 } else {
19baf839 2144 /*
91b9a277 2145 * New tnode. Decend one level
19baf839 2146 */
19baf839 2147
91b9a277
OJ
2148 depth++;
2149 tn = (struct tnode *)tn->child[cindex];
2150 pend = tn->pos + tn->bits;
2151 putspace_seq(seq, indent); seq_printf(seq, "\\--\n");
2152 indent += 3;
2153 cindex = 0;
19baf839 2154 }
91b9a277
OJ
2155 } else
2156 cindex++;
2157
2158 /*
2159 * Test if we are done
2160 */
2161
2162 while (cindex >= (1 << tn->bits)) {
2163 /*
2164 * Move upwards and test for root
2165 * pop off all traversed nodes
2166 */
2167
2168 if (NODE_PARENT(tn) == NULL) {
2169 tn = NULL;
2170 break;
2171 }
2172
2173 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2174 cindex++;
2175 tn = NODE_PARENT(tn);
2176 pend = tn->pos + tn->bits;
2177 indent -= 3;
2178 depth--;
19baf839 2179 }
19baf839 2180 }
19baf839
RO
2181
2182 read_unlock(&fib_lock);
2183}
2184
2185static struct trie_stat *trie_stat_new(void)
2186{
91b9a277 2187 struct trie_stat *s;
19baf839 2188 int i;
c877efb2 2189
91b9a277
OJ
2190 s = kmalloc(sizeof(struct trie_stat), GFP_KERNEL);
2191 if (!s)
2192 return NULL;
2193
2194 s->totdepth = 0;
2195 s->maxdepth = 0;
2196 s->tnodes = 0;
2197 s->leaves = 0;
2198 s->nullpointers = 0;
2199
2200 for (i = 0; i < MAX_CHILDS; i++)
2201 s->nodesizes[i] = 0;
2202
19baf839 2203 return s;
c877efb2 2204}
19baf839
RO
2205
2206static struct trie_stat *trie_collect_stats(struct trie *t)
2207{
c877efb2 2208 struct node *n = t->trie;
19baf839
RO
2209 struct trie_stat *s = trie_stat_new();
2210 int cindex = 0;
19baf839
RO
2211 int pend = 0;
2212 int depth = 0;
2213
91b9a277
OJ
2214 if (!s)
2215 return NULL;
2216 if (!n)
2217 return s;
19baf839 2218
91b9a277 2219 read_lock(&fib_lock);
19baf839 2220
91b9a277
OJ
2221 if (IS_TNODE(n)) {
2222 struct tnode *tn = (struct tnode *)n;
2223 pend = tn->pos+tn->bits;
2224 s->nodesizes[tn->bits]++;
2225 depth++;
2226
2227 while (tn && cindex < (1 << tn->bits)) {
2228 if (tn->child[cindex]) {
2229 /* Got a child */
19baf839 2230
91b9a277
OJ
2231 if (IS_LEAF(tn->child[cindex])) {
2232 cindex++;
2233
2234 /* stats */
2235 if (depth > s->maxdepth)
2236 s->maxdepth = depth;
2237 s->totdepth += depth;
2238 s->leaves++;
2239 } else {
19baf839 2240 /*
91b9a277 2241 * New tnode. Decend one level
19baf839 2242 */
91b9a277
OJ
2243
2244 s->tnodes++;
2245 s->nodesizes[tn->bits]++;
2246 depth++;
2247
2248 n = tn->child[cindex];
2249 tn = (struct tnode *)n;
2250 pend = tn->pos+tn->bits;
2251
2252 cindex = 0;
19baf839 2253 }
91b9a277
OJ
2254 } else {
2255 cindex++;
2256 s->nullpointers++;
19baf839 2257 }
91b9a277
OJ
2258
2259 /*
2260 * Test if we are done
2261 */
2262
2263 while (cindex >= (1 << tn->bits)) {
2264 /*
2265 * Move upwards and test for root
2266 * pop off all traversed nodes
2267 */
2268
2269 if (NODE_PARENT(tn) == NULL) {
2270 tn = NULL;
2271 n = NULL;
2272 break;
2273 }
2274
2275 cindex = tkey_extract_bits(tn->key, NODE_PARENT(tn)->pos, NODE_PARENT(tn)->bits);
2276 tn = NODE_PARENT(tn);
2277 cindex++;
2278 n = (struct node *)tn;
2279 pend = tn->pos+tn->bits;
2280 depth--;
2281 }
19baf839
RO
2282 }
2283 }
2284
91b9a277 2285 read_unlock(&fib_lock);
19baf839
RO
2286 return s;
2287}
2288
2289#ifdef CONFIG_PROC_FS
2290
2291static struct fib_alias *fib_triestat_get_first(struct seq_file *seq)
2292{
2293 return NULL;
2294}
2295
2296static struct fib_alias *fib_triestat_get_next(struct seq_file *seq)
2297{
2298 return NULL;
2299}
2300
2301static void *fib_triestat_seq_start(struct seq_file *seq, loff_t *pos)
2302{
91b9a277
OJ
2303 if (!ip_fib_main_table)
2304 return NULL;
19baf839 2305
91b9a277
OJ
2306 if (*pos)
2307 return fib_triestat_get_next(seq);
2308 else
2309 return SEQ_START_TOKEN;
19baf839
RO
2310}
2311
2312static void *fib_triestat_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2313{
2314 ++*pos;
91b9a277
OJ
2315 if (v == SEQ_START_TOKEN)
2316 return fib_triestat_get_first(seq);
2317 else
2318 return fib_triestat_get_next(seq);
19baf839
RO
2319}
2320
2321static void fib_triestat_seq_stop(struct seq_file *seq, void *v)
2322{
2323
2324}
2325
c877efb2 2326/*
19baf839
RO
2327 * This outputs /proc/net/fib_triestats
2328 *
2329 * It always works in backward compatibility mode.
2330 * The format of the file is not supposed to be changed.
2331 */
2332
2333static void collect_and_show(struct trie *t, struct seq_file *seq)
2334{
2335 int bytes = 0; /* How many bytes are used, a ref is 4 bytes */
2336 int i, max, pointers;
91b9a277 2337 struct trie_stat *stat;
19baf839
RO
2338 int avdepth;
2339
2340 stat = trie_collect_stats(t);
2341
91b9a277 2342 bytes = 0;
19baf839
RO
2343 seq_printf(seq, "trie=%p\n", t);
2344
2345 if (stat) {
2346 if (stat->leaves)
91b9a277 2347 avdepth = stat->totdepth*100 / stat->leaves;
19baf839 2348 else
91b9a277
OJ
2349 avdepth = 0;
2350 seq_printf(seq, "Aver depth: %d.%02d\n", avdepth / 100, avdepth % 100);
19baf839 2351 seq_printf(seq, "Max depth: %4d\n", stat->maxdepth);
91b9a277 2352
19baf839
RO
2353 seq_printf(seq, "Leaves: %d\n", stat->leaves);
2354 bytes += sizeof(struct leaf) * stat->leaves;
2355 seq_printf(seq, "Internal nodes: %d\n", stat->tnodes);
2356 bytes += sizeof(struct tnode) * stat->tnodes;
2357
2358 max = MAX_CHILDS-1;
2359
2360 while (max >= 0 && stat->nodesizes[max] == 0)
2361 max--;
2362 pointers = 0;
2363
c877efb2 2364 for (i = 1; i <= max; i++)
19baf839
RO
2365 if (stat->nodesizes[i] != 0) {
2366 seq_printf(seq, " %d: %d", i, stat->nodesizes[i]);
2367 pointers += (1<<i) * stat->nodesizes[i];
2368 }
2369 seq_printf(seq, "\n");
2370 seq_printf(seq, "Pointers: %d\n", pointers);
2371 bytes += sizeof(struct node *) * pointers;
2372 seq_printf(seq, "Null ptrs: %d\n", stat->nullpointers);
2373 seq_printf(seq, "Total size: %d kB\n", bytes / 1024);
2374
2375 kfree(stat);
2376 }
2377
2378#ifdef CONFIG_IP_FIB_TRIE_STATS
2379 seq_printf(seq, "Counters:\n---------\n");
2380 seq_printf(seq,"gets = %d\n", t->stats.gets);
2381 seq_printf(seq,"backtracks = %d\n", t->stats.backtrack);
2382 seq_printf(seq,"semantic match passed = %d\n", t->stats.semantic_match_passed);
2383 seq_printf(seq,"semantic match miss = %d\n", t->stats.semantic_match_miss);
2384 seq_printf(seq,"null node hit= %d\n", t->stats.null_node_hit);
2f36895a 2385 seq_printf(seq,"skipped node resize = %d\n", t->stats.resize_node_skipped);
19baf839
RO
2386#ifdef CLEAR_STATS
2387 memset(&(t->stats), 0, sizeof(t->stats));
2388#endif
2389#endif /* CONFIG_IP_FIB_TRIE_STATS */
2390}
2391
2392static int fib_triestat_seq_show(struct seq_file *seq, void *v)
2393{
2394 char bf[128];
c877efb2 2395
19baf839 2396 if (v == SEQ_START_TOKEN) {
c877efb2 2397 seq_printf(seq, "Basic info: size of leaf: %Zd bytes, size of tnode: %Zd bytes.\n",
19baf839 2398 sizeof(struct leaf), sizeof(struct tnode));
c877efb2 2399 if (trie_local)
19baf839
RO
2400 collect_and_show(trie_local, seq);
2401
c877efb2 2402 if (trie_main)
19baf839 2403 collect_and_show(trie_main, seq);
91b9a277
OJ
2404 } else {
2405 snprintf(bf, sizeof(bf), "*\t%08X\t%08X", 200, 400);
2406
19baf839
RO
2407 seq_printf(seq, "%-127s\n", bf);
2408 }
2409 return 0;
2410}
2411
2412static struct seq_operations fib_triestat_seq_ops = {
c877efb2
SH
2413 .start = fib_triestat_seq_start,
2414 .next = fib_triestat_seq_next,
2415 .stop = fib_triestat_seq_stop,
2416 .show = fib_triestat_seq_show,
19baf839
RO
2417};
2418
2419static int fib_triestat_seq_open(struct inode *inode, struct file *file)
2420{
2421 struct seq_file *seq;
2422 int rc = -ENOMEM;
2423
2424 rc = seq_open(file, &fib_triestat_seq_ops);
2425 if (rc)
2426 goto out_kfree;
2427
c877efb2 2428 seq = file->private_data;
19baf839
RO
2429out:
2430 return rc;
2431out_kfree:
2432 goto out;
2433}
2434
2435static struct file_operations fib_triestat_seq_fops = {
c877efb2
SH
2436 .owner = THIS_MODULE,
2437 .open = fib_triestat_seq_open,
2438 .read = seq_read,
2439 .llseek = seq_lseek,
2440 .release = seq_release_private,
19baf839
RO
2441};
2442
2443int __init fib_stat_proc_init(void)
2444{
2445 if (!proc_net_fops_create("fib_triestat", S_IRUGO, &fib_triestat_seq_fops))
2446 return -ENOMEM;
2447 return 0;
2448}
2449
2450void __init fib_stat_proc_exit(void)
2451{
2452 proc_net_remove("fib_triestat");
2453}
2454
2455static struct fib_alias *fib_trie_get_first(struct seq_file *seq)
2456{
2457 return NULL;
2458}
2459
2460static struct fib_alias *fib_trie_get_next(struct seq_file *seq)
2461{
2462 return NULL;
2463}
2464
2465static void *fib_trie_seq_start(struct seq_file *seq, loff_t *pos)
2466{
91b9a277
OJ
2467 if (!ip_fib_main_table)
2468 return NULL;
19baf839 2469
91b9a277
OJ
2470 if (*pos)
2471 return fib_trie_get_next(seq);
2472 else
2473 return SEQ_START_TOKEN;
19baf839
RO
2474}
2475
2476static void *fib_trie_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2477{
2478 ++*pos;
91b9a277
OJ
2479 if (v == SEQ_START_TOKEN)
2480 return fib_trie_get_first(seq);
2481 else
2482 return fib_trie_get_next(seq);
2483
19baf839
RO
2484}
2485
2486static void fib_trie_seq_stop(struct seq_file *seq, void *v)
2487{
19baf839
RO
2488}
2489
c877efb2 2490/*
19baf839
RO
2491 * This outputs /proc/net/fib_trie.
2492 *
2493 * It always works in backward compatibility mode.
2494 * The format of the file is not supposed to be changed.
2495 */
2496
2497static int fib_trie_seq_show(struct seq_file *seq, void *v)
2498{
2499 char bf[128];
2500
2501 if (v == SEQ_START_TOKEN) {
c877efb2 2502 if (trie_local)
19baf839
RO
2503 trie_dump_seq(seq, trie_local);
2504
c877efb2 2505 if (trie_main)
19baf839 2506 trie_dump_seq(seq, trie_main);
91b9a277 2507 } else {
19baf839
RO
2508 snprintf(bf, sizeof(bf),
2509 "*\t%08X\t%08X", 200, 400);
2510 seq_printf(seq, "%-127s\n", bf);
2511 }
2512
2513 return 0;
2514}
2515
2516static struct seq_operations fib_trie_seq_ops = {
c877efb2
SH
2517 .start = fib_trie_seq_start,
2518 .next = fib_trie_seq_next,
2519 .stop = fib_trie_seq_stop,
2520 .show = fib_trie_seq_show,
19baf839
RO
2521};
2522
2523static int fib_trie_seq_open(struct inode *inode, struct file *file)
2524{
2525 struct seq_file *seq;
2526 int rc = -ENOMEM;
2527
2528 rc = seq_open(file, &fib_trie_seq_ops);
2529 if (rc)
2530 goto out_kfree;
2531
c877efb2 2532 seq = file->private_data;
19baf839
RO
2533out:
2534 return rc;
2535out_kfree:
2536 goto out;
2537}
2538
2539static struct file_operations fib_trie_seq_fops = {
c877efb2
SH
2540 .owner = THIS_MODULE,
2541 .open = fib_trie_seq_open,
2542 .read = seq_read,
2543 .llseek = seq_lseek,
2544 .release= seq_release_private,
19baf839
RO
2545};
2546
2547int __init fib_proc_init(void)
2548{
2549 if (!proc_net_fops_create("fib_trie", S_IRUGO, &fib_trie_seq_fops))
2550 return -ENOMEM;
2551 return 0;
2552}
2553
2554void __init fib_proc_exit(void)
2555{
2556 proc_net_remove("fib_trie");
2557}
2558
2559#endif /* CONFIG_PROC_FS */