]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/ipv4/fib_semantics.c
ipsec: pfkey should ignore events when no listeners
[net-next-2.6.git] / net / ipv4 / fib_semantics.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * IPv4 Forwarding Information Base: semantics.
7 *
8 * Version: $Id: fib_semantics.c,v 1.19 2002/01/12 07:54:56 davem Exp $
9 *
10 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
11 *
12 * This program is free software; you can redistribute it and/or
13 * modify it under the terms of the GNU General Public License
14 * as published by the Free Software Foundation; either version
15 * 2 of the License, or (at your option) any later version.
16 */
17
1da177e4
LT
18#include <asm/uaccess.h>
19#include <asm/system.h>
20#include <linux/bitops.h>
21#include <linux/types.h>
22#include <linux/kernel.h>
23#include <linux/jiffies.h>
24#include <linux/mm.h>
25#include <linux/string.h>
26#include <linux/socket.h>
27#include <linux/sockios.h>
28#include <linux/errno.h>
29#include <linux/in.h>
30#include <linux/inet.h>
14c85021 31#include <linux/inetdevice.h>
1da177e4
LT
32#include <linux/netdevice.h>
33#include <linux/if_arp.h>
34#include <linux/proc_fs.h>
35#include <linux/skbuff.h>
1da177e4
LT
36#include <linux/init.h>
37
14c85021 38#include <net/arp.h>
1da177e4
LT
39#include <net/ip.h>
40#include <net/protocol.h>
41#include <net/route.h>
42#include <net/tcp.h>
43#include <net/sock.h>
44#include <net/ip_fib.h>
f21c7bc5 45#include <net/netlink.h>
4e902c57 46#include <net/nexthop.h>
1da177e4
LT
47
48#include "fib_lookup.h"
49
832b4c5e 50static DEFINE_SPINLOCK(fib_info_lock);
1da177e4
LT
51static struct hlist_head *fib_info_hash;
52static struct hlist_head *fib_info_laddrhash;
53static unsigned int fib_hash_size;
54static unsigned int fib_info_cnt;
55
56#define DEVINDEX_HASHBITS 8
57#define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
58static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
59
/*
 * Nexthop iteration helpers.
 *
 * for_nexthops(fi) / change_nexthops(fi) open a block that declares
 * "nhsel" (the nexthop index) and "nh" (the current nexthop, const in
 * the for_nexthops flavour) and start a loop over them; the caller
 * supplies the loop body and must close the block with
 * endfor_nexthops(fi).
 */
#ifdef CONFIG_IP_ROUTE_MULTIPATH

static DEFINE_SPINLOCK(fib_multipath_lock);

#define for_nexthops(fi) {						\
	int nhsel;							\
	const struct fib_nh *nh;					\
	for (nhsel = 0, nh = (fi)->fib_nh;				\
	     nhsel < (fi)->fib_nhs;					\
	     nh++, nhsel++)

#define change_nexthops(fi) {						\
	int nhsel;							\
	struct fib_nh *nh;						\
	for (nhsel = 0, nh = (struct fib_nh *)((fi)->fib_nh);		\
	     nhsel < (fi)->fib_nhs;					\
	     nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope, that gcc will optimize it to get rid of dummy loop */

#define for_nexthops(fi) {						\
	int nhsel = 0;							\
	const struct fib_nh *nh = (fi)->fib_nh;				\
	for (nhsel = 0; nhsel < 1; nhsel++)

#define change_nexthops(fi) {						\
	int nhsel = 0;							\
	struct fib_nh *nh = (struct fib_nh *)((fi)->fib_nh);		\
	for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

#define endfor_nexthops(fi) }
83
84
e905a9ed 85static const struct
1da177e4
LT
86{
87 int error;
88 u8 scope;
a0ee18b9 89} fib_props[RTN_MAX + 1] = {
e905a9ed 90 {
1da177e4
LT
91 .error = 0,
92 .scope = RT_SCOPE_NOWHERE,
93 }, /* RTN_UNSPEC */
94 {
95 .error = 0,
96 .scope = RT_SCOPE_UNIVERSE,
97 }, /* RTN_UNICAST */
98 {
99 .error = 0,
100 .scope = RT_SCOPE_HOST,
101 }, /* RTN_LOCAL */
102 {
103 .error = 0,
104 .scope = RT_SCOPE_LINK,
105 }, /* RTN_BROADCAST */
106 {
107 .error = 0,
108 .scope = RT_SCOPE_LINK,
109 }, /* RTN_ANYCAST */
110 {
111 .error = 0,
112 .scope = RT_SCOPE_UNIVERSE,
113 }, /* RTN_MULTICAST */
114 {
115 .error = -EINVAL,
116 .scope = RT_SCOPE_UNIVERSE,
117 }, /* RTN_BLACKHOLE */
118 {
119 .error = -EHOSTUNREACH,
120 .scope = RT_SCOPE_UNIVERSE,
121 }, /* RTN_UNREACHABLE */
122 {
123 .error = -EACCES,
124 .scope = RT_SCOPE_UNIVERSE,
125 }, /* RTN_PROHIBIT */
126 {
127 .error = -EAGAIN,
128 .scope = RT_SCOPE_UNIVERSE,
129 }, /* RTN_THROW */
130 {
131 .error = -EINVAL,
132 .scope = RT_SCOPE_NOWHERE,
133 }, /* RTN_NAT */
134 {
135 .error = -EINVAL,
136 .scope = RT_SCOPE_NOWHERE,
137 }, /* RTN_XRESOLVE */
138};
139
140
141/* Release a nexthop info record */
142
143void free_fib_info(struct fib_info *fi)
144{
145 if (fi->fib_dead == 0) {
a6db9010 146 printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
1da177e4
LT
147 return;
148 }
149 change_nexthops(fi) {
150 if (nh->nh_dev)
151 dev_put(nh->nh_dev);
152 nh->nh_dev = NULL;
153 } endfor_nexthops(fi);
154 fib_info_cnt--;
57d7a600 155 release_net(fi->fib_net);
1da177e4
LT
156 kfree(fi);
157}
158
159void fib_release_info(struct fib_info *fi)
160{
832b4c5e 161 spin_lock_bh(&fib_info_lock);
1da177e4
LT
162 if (fi && --fi->fib_treeref == 0) {
163 hlist_del(&fi->fib_hash);
164 if (fi->fib_prefsrc)
165 hlist_del(&fi->fib_lhash);
166 change_nexthops(fi) {
167 if (!nh->nh_dev)
168 continue;
169 hlist_del(&nh->nh_hash);
170 } endfor_nexthops(fi)
171 fi->fib_dead = 1;
172 fib_info_put(fi);
173 }
832b4c5e 174 spin_unlock_bh(&fib_info_lock);
1da177e4
LT
175}
176
177static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
178{
179 const struct fib_nh *onh = ofi->fib_nh;
180
181 for_nexthops(fi) {
182 if (nh->nh_oif != onh->nh_oif ||
183 nh->nh_gw != onh->nh_gw ||
184 nh->nh_scope != onh->nh_scope ||
185#ifdef CONFIG_IP_ROUTE_MULTIPATH
186 nh->nh_weight != onh->nh_weight ||
187#endif
188#ifdef CONFIG_NET_CLS_ROUTE
189 nh->nh_tclassid != onh->nh_tclassid ||
190#endif
191 ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
192 return -1;
193 onh++;
194 } endfor_nexthops(fi);
195 return 0;
196}
197
88ebc72f
DM
198static inline unsigned int fib_devindex_hashfn(unsigned int val)
199{
200 unsigned int mask = DEVINDEX_HASHSIZE - 1;
201
202 return (val ^
203 (val >> DEVINDEX_HASHBITS) ^
204 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
205}
206
1da177e4
LT
207static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
208{
209 unsigned int mask = (fib_hash_size - 1);
210 unsigned int val = fi->fib_nhs;
211
212 val ^= fi->fib_protocol;
81f7bf6c 213 val ^= (__force u32)fi->fib_prefsrc;
1da177e4 214 val ^= fi->fib_priority;
88ebc72f
DM
215 for_nexthops(fi) {
216 val ^= fib_devindex_hashfn(nh->nh_oif);
217 } endfor_nexthops(fi)
1da177e4
LT
218
219 return (val ^ (val >> 7) ^ (val >> 12)) & mask;
220}
221
222static struct fib_info *fib_find_info(const struct fib_info *nfi)
223{
224 struct hlist_head *head;
225 struct hlist_node *node;
226 struct fib_info *fi;
227 unsigned int hash;
228
229 hash = fib_info_hashfn(nfi);
230 head = &fib_info_hash[hash];
231
232 hlist_for_each_entry(fi, node, head, fib_hash) {
4814bdbd
DL
233 if (fi->fib_net != nfi->fib_net)
234 continue;
1da177e4
LT
235 if (fi->fib_nhs != nfi->fib_nhs)
236 continue;
237 if (nfi->fib_protocol == fi->fib_protocol &&
238 nfi->fib_prefsrc == fi->fib_prefsrc &&
239 nfi->fib_priority == fi->fib_priority &&
240 memcmp(nfi->fib_metrics, fi->fib_metrics,
241 sizeof(fi->fib_metrics)) == 0 &&
242 ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
243 (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
244 return fi;
245 }
246
247 return NULL;
248}
249
1da177e4
LT
250/* Check, that the gateway is already configured.
251 Used only by redirect accept routine.
252 */
253
d878e72e 254int ip_fib_check_default(__be32 gw, struct net_device *dev)
1da177e4
LT
255{
256 struct hlist_head *head;
257 struct hlist_node *node;
258 struct fib_nh *nh;
259 unsigned int hash;
260
832b4c5e 261 spin_lock(&fib_info_lock);
1da177e4
LT
262
263 hash = fib_devindex_hashfn(dev->ifindex);
264 head = &fib_info_devhash[hash];
265 hlist_for_each_entry(nh, node, head, nh_hash) {
266 if (nh->nh_dev == dev &&
267 nh->nh_gw == gw &&
268 !(nh->nh_flags&RTNH_F_DEAD)) {
832b4c5e 269 spin_unlock(&fib_info_lock);
1da177e4
LT
270 return 0;
271 }
272 }
273
832b4c5e 274 spin_unlock(&fib_info_lock);
1da177e4
LT
275
276 return -1;
277}
278
339bf98f
TG
279static inline size_t fib_nlmsg_size(struct fib_info *fi)
280{
281 size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
282 + nla_total_size(4) /* RTA_TABLE */
283 + nla_total_size(4) /* RTA_DST */
284 + nla_total_size(4) /* RTA_PRIORITY */
285 + nla_total_size(4); /* RTA_PREFSRC */
286
287 /* space for nested metrics */
288 payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
289
290 if (fi->fib_nhs) {
291 /* Also handles the special case fib_nhs == 1 */
292
293 /* each nexthop is packed in an attribute */
294 size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
295
296 /* may contain flow and gateway attribute */
297 nhsize += 2 * nla_total_size(4);
298
299 /* all nexthops are packed in a nested attribute */
300 payload += nla_total_size(fi->fib_nhs * nhsize);
301 }
302
303 return payload;
304}
305
81f7bf6c 306void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
b8f55831
MK
307 int dst_len, u32 tb_id, struct nl_info *info,
308 unsigned int nlm_flags)
1da177e4
LT
309{
310 struct sk_buff *skb;
4e902c57 311 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
f21c7bc5 312 int err = -ENOBUFS;
1da177e4 313
339bf98f 314 skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
f21c7bc5
TG
315 if (skb == NULL)
316 goto errout;
1da177e4 317
4e902c57 318 err = fib_dump_info(skb, info->pid, seq, event, tb_id,
be403ea1 319 fa->fa_type, fa->fa_scope, key, dst_len,
b8f55831 320 fa->fa_tos, fa->fa_info, nlm_flags);
26932566
PM
321 if (err < 0) {
322 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
323 WARN_ON(err == -EMSGSIZE);
324 kfree_skb(skb);
325 goto errout;
326 }
4d1169c1 327 err = rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
4e902c57 328 info->nlh, GFP_KERNEL);
f21c7bc5
TG
329errout:
330 if (err < 0)
4d1169c1 331 rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
1da177e4
LT
332}
333
334/* Return the first fib alias matching TOS with
335 * priority less than or equal to PRIO.
336 */
337struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
338{
339 if (fah) {
340 struct fib_alias *fa;
341 list_for_each_entry(fa, fah, fa_list) {
342 if (fa->fa_tos > tos)
343 continue;
344 if (fa->fa_info->fib_priority >= prio ||
345 fa->fa_tos < tos)
346 return fa;
347 }
348 }
349 return NULL;
350}
351
352int fib_detect_death(struct fib_info *fi, int order,
c17860a0 353 struct fib_info **last_resort, int *last_idx, int dflt)
1da177e4
LT
354{
355 struct neighbour *n;
356 int state = NUD_NONE;
357
358 n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
359 if (n) {
360 state = n->nud_state;
361 neigh_release(n);
362 }
363 if (state==NUD_REACHABLE)
364 return 0;
c17860a0 365 if ((state&NUD_VALID) && order != dflt)
1da177e4
LT
366 return 0;
367 if ((state&NUD_VALID) ||
c17860a0 368 (*last_idx<0 && order > dflt)) {
1da177e4
LT
369 *last_resort = fi;
370 *last_idx = order;
371 }
372 return 1;
373}
374
375#ifdef CONFIG_IP_ROUTE_MULTIPATH
376
/*
 * Count the rtnexthop entries in a buffer of @remaining bytes starting
 * at @rtnh.  Returns 0 if bytes are left over after the last valid
 * entry.
 */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
	int count;

	for (count = 0; rtnh_ok(rtnh, remaining); count++)
		rtnh = rtnh_next(rtnh, &remaining);

	/* leftover implies invalid nexthop configuration, discard it */
	if (remaining > 0)
		return 0;

	return count;
}
389
4e902c57
TG
390static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
391 int remaining, struct fib_config *cfg)
1da177e4 392{
1da177e4 393 change_nexthops(fi) {
4e902c57
TG
394 int attrlen;
395
396 if (!rtnh_ok(rtnh, remaining))
1da177e4 397 return -EINVAL;
4e902c57
TG
398
399 nh->nh_flags = (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
400 nh->nh_oif = rtnh->rtnh_ifindex;
401 nh->nh_weight = rtnh->rtnh_hops + 1;
402
403 attrlen = rtnh_attrlen(rtnh);
404 if (attrlen > 0) {
405 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
406
407 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
17fb2c64 408 nh->nh_gw = nla ? nla_get_be32(nla) : 0;
1da177e4 409#ifdef CONFIG_NET_CLS_ROUTE
4e902c57
TG
410 nla = nla_find(attrs, attrlen, RTA_FLOW);
411 nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
1da177e4
LT
412#endif
413 }
4e902c57
TG
414
415 rtnh = rtnh_next(rtnh, &remaining);
1da177e4 416 } endfor_nexthops(fi);
4e902c57 417
1da177e4
LT
418 return 0;
419}
420
421#endif
422
4e902c57 423int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
1da177e4
LT
424{
425#ifdef CONFIG_IP_ROUTE_MULTIPATH
4e902c57
TG
426 struct rtnexthop *rtnh;
427 int remaining;
1da177e4
LT
428#endif
429
4e902c57 430 if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
1da177e4
LT
431 return 1;
432
4e902c57
TG
433 if (cfg->fc_oif || cfg->fc_gw) {
434 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
435 (!cfg->fc_gw || cfg->fc_gw == fi->fib_nh->nh_gw))
1da177e4
LT
436 return 0;
437 return 1;
438 }
439
440#ifdef CONFIG_IP_ROUTE_MULTIPATH
4e902c57 441 if (cfg->fc_mp == NULL)
1da177e4 442 return 0;
4e902c57
TG
443
444 rtnh = cfg->fc_mp;
445 remaining = cfg->fc_mp_len;
e905a9ed 446
1da177e4 447 for_nexthops(fi) {
4e902c57 448 int attrlen;
1da177e4 449
4e902c57 450 if (!rtnh_ok(rtnh, remaining))
1da177e4 451 return -EINVAL;
4e902c57
TG
452
453 if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
1da177e4 454 return 1;
4e902c57
TG
455
456 attrlen = rtnh_attrlen(rtnh);
457 if (attrlen < 0) {
458 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
459
460 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
17fb2c64 461 if (nla && nla_get_be32(nla) != nh->nh_gw)
1da177e4
LT
462 return 1;
463#ifdef CONFIG_NET_CLS_ROUTE
4e902c57
TG
464 nla = nla_find(attrs, attrlen, RTA_FLOW);
465 if (nla && nla_get_u32(nla) != nh->nh_tclassid)
1da177e4
LT
466 return 1;
467#endif
468 }
4e902c57
TG
469
470 rtnh = rtnh_next(rtnh, &remaining);
1da177e4
LT
471 } endfor_nexthops(fi);
472#endif
473 return 0;
474}
475
476
477/*
478 Picture
479 -------
480
481 Semantics of nexthop is very messy for historical reasons.
482 We have to take into account, that:
483 a) gateway can be actually local interface address,
484 so that gatewayed route is direct.
485 b) gateway must be on-link address, possibly
486 described not by an ifaddr, but also by a direct route.
487 c) If both gateway and interface are specified, they should not
488 contradict.
489 d) If we use tunnel routes, gateway could be not on-link.
490
491 Attempt to reconcile all of these (alas, self-contradictory) conditions
492 results in pretty ugly and hairy code with obscure logic.
493
494 I chose to generalize it instead, so that the size
495 of code does not increase practically, but it becomes
496 much more general.
497 Every prefix is assigned a "scope" value: "host" is local address,
498 "link" is direct route,
499 [ ... "site" ... "interior" ... ]
500 and "universe" is true gateway route with global meaning.
501
502 Every prefix refers to a set of "nexthop"s (gw, oif),
503 where gw must have narrower scope. This recursion stops
504 when gw has LOCAL scope or if "nexthop" is declared ONLINK,
505 which means that gw is forced to be on link.
506
507 Code is still hairy, but now it is apparently logically
508 consistent and very flexible. E.g. as a by-product it allows
509 independent exterior and interior routing processes to
510 coexist in peace.
511
512 Normally it looks as following.
513
514 {universe prefix} -> (gw, oif) [scope link]
e905a9ed 515 |
1da177e4 516 |-> {link prefix} -> (gw, oif) [scope local]
e905a9ed 517 |
1da177e4
LT
518 |-> {local prefix} (terminal node)
519 */
520
4e902c57
TG
521static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
522 struct fib_nh *nh)
1da177e4
LT
523{
524 int err;
86167a37 525 struct net *net;
1da177e4 526
86167a37 527 net = cfg->fc_nlinfo.nl_net;
1da177e4
LT
528 if (nh->nh_gw) {
529 struct fib_result res;
530
531#ifdef CONFIG_IP_ROUTE_PERVASIVE
532 if (nh->nh_flags&RTNH_F_PERVASIVE)
533 return 0;
534#endif
535 if (nh->nh_flags&RTNH_F_ONLINK) {
536 struct net_device *dev;
537
4e902c57 538 if (cfg->fc_scope >= RT_SCOPE_LINK)
1da177e4 539 return -EINVAL;
86167a37 540 if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
1da177e4 541 return -EINVAL;
86167a37 542 if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
1da177e4
LT
543 return -ENODEV;
544 if (!(dev->flags&IFF_UP))
545 return -ENETDOWN;
546 nh->nh_dev = dev;
547 dev_hold(dev);
548 nh->nh_scope = RT_SCOPE_LINK;
549 return 0;
550 }
551 {
4e902c57
TG
552 struct flowi fl = {
553 .nl_u = {
554 .ip4_u = {
555 .daddr = nh->nh_gw,
556 .scope = cfg->fc_scope + 1,
557 },
558 },
559 .oif = nh->nh_oif,
560 };
1da177e4
LT
561
562 /* It is not necessary, but requires a bit of thinking */
563 if (fl.fl4_scope < RT_SCOPE_LINK)
564 fl.fl4_scope = RT_SCOPE_LINK;
86167a37 565 if ((err = fib_lookup(net, &fl, &res)) != 0)
1da177e4
LT
566 return err;
567 }
568 err = -EINVAL;
569 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
570 goto out;
571 nh->nh_scope = res.scope;
572 nh->nh_oif = FIB_RES_OIF(res);
573 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
574 goto out;
575 dev_hold(nh->nh_dev);
576 err = -ENETDOWN;
577 if (!(nh->nh_dev->flags & IFF_UP))
578 goto out;
579 err = 0;
580out:
581 fib_res_put(&res);
582 return err;
583 } else {
584 struct in_device *in_dev;
585
586 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
587 return -EINVAL;
588
86167a37 589 in_dev = inetdev_by_index(net, nh->nh_oif);
1da177e4
LT
590 if (in_dev == NULL)
591 return -ENODEV;
592 if (!(in_dev->dev->flags&IFF_UP)) {
593 in_dev_put(in_dev);
594 return -ENETDOWN;
595 }
596 nh->nh_dev = in_dev->dev;
597 dev_hold(nh->nh_dev);
598 nh->nh_scope = RT_SCOPE_HOST;
599 in_dev_put(in_dev);
600 }
601 return 0;
602}
603
81f7bf6c 604static inline unsigned int fib_laddr_hashfn(__be32 val)
1da177e4
LT
605{
606 unsigned int mask = (fib_hash_size - 1);
607
81f7bf6c 608 return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
1da177e4
LT
609}
610
611static struct hlist_head *fib_hash_alloc(int bytes)
612{
613 if (bytes <= PAGE_SIZE)
88f83491 614 return kzalloc(bytes, GFP_KERNEL);
1da177e4
LT
615 else
616 return (struct hlist_head *)
88f83491 617 __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
1da177e4
LT
618}
619
620static void fib_hash_free(struct hlist_head *hash, int bytes)
621{
622 if (!hash)
623 return;
624
625 if (bytes <= PAGE_SIZE)
626 kfree(hash);
627 else
628 free_pages((unsigned long) hash, get_order(bytes));
629}
630
631static void fib_hash_move(struct hlist_head *new_info_hash,
632 struct hlist_head *new_laddrhash,
633 unsigned int new_size)
634{
b7656e7f 635 struct hlist_head *old_info_hash, *old_laddrhash;
1da177e4 636 unsigned int old_size = fib_hash_size;
b7656e7f 637 unsigned int i, bytes;
1da177e4 638
832b4c5e 639 spin_lock_bh(&fib_info_lock);
b7656e7f
DM
640 old_info_hash = fib_info_hash;
641 old_laddrhash = fib_info_laddrhash;
1da177e4
LT
642 fib_hash_size = new_size;
643
644 for (i = 0; i < old_size; i++) {
645 struct hlist_head *head = &fib_info_hash[i];
646 struct hlist_node *node, *n;
647 struct fib_info *fi;
648
649 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
650 struct hlist_head *dest;
651 unsigned int new_hash;
652
653 hlist_del(&fi->fib_hash);
654
655 new_hash = fib_info_hashfn(fi);
656 dest = &new_info_hash[new_hash];
657 hlist_add_head(&fi->fib_hash, dest);
658 }
659 }
660 fib_info_hash = new_info_hash;
661
662 for (i = 0; i < old_size; i++) {
663 struct hlist_head *lhead = &fib_info_laddrhash[i];
664 struct hlist_node *node, *n;
665 struct fib_info *fi;
666
667 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
668 struct hlist_head *ldest;
669 unsigned int new_hash;
670
671 hlist_del(&fi->fib_lhash);
672
673 new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
674 ldest = &new_laddrhash[new_hash];
675 hlist_add_head(&fi->fib_lhash, ldest);
676 }
677 }
678 fib_info_laddrhash = new_laddrhash;
679
832b4c5e 680 spin_unlock_bh(&fib_info_lock);
b7656e7f
DM
681
682 bytes = old_size * sizeof(struct hlist_head *);
683 fib_hash_free(old_info_hash, bytes);
684 fib_hash_free(old_laddrhash, bytes);
1da177e4
LT
685}
686
4e902c57 687struct fib_info *fib_create_info(struct fib_config *cfg)
1da177e4
LT
688{
689 int err;
690 struct fib_info *fi = NULL;
691 struct fib_info *ofi;
1da177e4 692 int nhs = 1;
7462bd74 693 struct net *net = cfg->fc_nlinfo.nl_net;
1da177e4
LT
694
695 /* Fast check to catch the most weird cases */
4e902c57 696 if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
1da177e4
LT
697 goto err_inval;
698
699#ifdef CONFIG_IP_ROUTE_MULTIPATH
4e902c57
TG
700 if (cfg->fc_mp) {
701 nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
1da177e4
LT
702 if (nhs == 0)
703 goto err_inval;
704 }
705#endif
1da177e4
LT
706
707 err = -ENOBUFS;
708 if (fib_info_cnt >= fib_hash_size) {
709 unsigned int new_size = fib_hash_size << 1;
710 struct hlist_head *new_info_hash;
711 struct hlist_head *new_laddrhash;
712 unsigned int bytes;
713
714 if (!new_size)
715 new_size = 1;
716 bytes = new_size * sizeof(struct hlist_head *);
717 new_info_hash = fib_hash_alloc(bytes);
718 new_laddrhash = fib_hash_alloc(bytes);
719 if (!new_info_hash || !new_laddrhash) {
720 fib_hash_free(new_info_hash, bytes);
721 fib_hash_free(new_laddrhash, bytes);
88f83491 722 } else
1da177e4 723 fib_hash_move(new_info_hash, new_laddrhash, new_size);
1da177e4
LT
724
725 if (!fib_hash_size)
726 goto failure;
727 }
728
0da974f4 729 fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
1da177e4
LT
730 if (fi == NULL)
731 goto failure;
732 fib_info_cnt++;
1da177e4 733
57d7a600 734 fi->fib_net = hold_net(net);
4e902c57
TG
735 fi->fib_protocol = cfg->fc_protocol;
736 fi->fib_flags = cfg->fc_flags;
737 fi->fib_priority = cfg->fc_priority;
738 fi->fib_prefsrc = cfg->fc_prefsrc;
1da177e4
LT
739
740 fi->fib_nhs = nhs;
741 change_nexthops(fi) {
742 nh->nh_parent = fi;
743 } endfor_nexthops(fi)
744
4e902c57
TG
745 if (cfg->fc_mx) {
746 struct nlattr *nla;
747 int remaining;
748
749 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
8f4c1f9b 750 int type = nla_type(nla);
4e902c57
TG
751
752 if (type) {
753 if (type > RTAX_MAX)
1da177e4 754 goto err_inval;
4e902c57 755 fi->fib_metrics[type - 1] = nla_get_u32(nla);
1da177e4 756 }
1da177e4
LT
757 }
758 }
1da177e4 759
4e902c57 760 if (cfg->fc_mp) {
1da177e4 761#ifdef CONFIG_IP_ROUTE_MULTIPATH
4e902c57
TG
762 err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
763 if (err != 0)
1da177e4 764 goto failure;
4e902c57 765 if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
1da177e4 766 goto err_inval;
4e902c57 767 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
1da177e4
LT
768 goto err_inval;
769#ifdef CONFIG_NET_CLS_ROUTE
4e902c57 770 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
1da177e4
LT
771 goto err_inval;
772#endif
773#else
774 goto err_inval;
775#endif
776 } else {
777 struct fib_nh *nh = fi->fib_nh;
4e902c57
TG
778
779 nh->nh_oif = cfg->fc_oif;
780 nh->nh_gw = cfg->fc_gw;
781 nh->nh_flags = cfg->fc_flags;
1da177e4 782#ifdef CONFIG_NET_CLS_ROUTE
4e902c57 783 nh->nh_tclassid = cfg->fc_flow;
1da177e4 784#endif
1da177e4
LT
785#ifdef CONFIG_IP_ROUTE_MULTIPATH
786 nh->nh_weight = 1;
787#endif
788 }
789
4e902c57
TG
790 if (fib_props[cfg->fc_type].error) {
791 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
1da177e4
LT
792 goto err_inval;
793 goto link_it;
794 }
795
4e902c57 796 if (cfg->fc_scope > RT_SCOPE_HOST)
1da177e4
LT
797 goto err_inval;
798
4e902c57 799 if (cfg->fc_scope == RT_SCOPE_HOST) {
1da177e4
LT
800 struct fib_nh *nh = fi->fib_nh;
801
802 /* Local address is added. */
803 if (nhs != 1 || nh->nh_gw)
804 goto err_inval;
805 nh->nh_scope = RT_SCOPE_NOWHERE;
7462bd74 806 nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
1da177e4
LT
807 err = -ENODEV;
808 if (nh->nh_dev == NULL)
809 goto failure;
810 } else {
811 change_nexthops(fi) {
4e902c57 812 if ((err = fib_check_nh(cfg, fi, nh)) != 0)
1da177e4
LT
813 goto failure;
814 } endfor_nexthops(fi)
815 }
816
817 if (fi->fib_prefsrc) {
4e902c57
TG
818 if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
819 fi->fib_prefsrc != cfg->fc_dst)
7462bd74 820 if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
1da177e4
LT
821 goto err_inval;
822 }
823
824link_it:
825 if ((ofi = fib_find_info(fi)) != NULL) {
826 fi->fib_dead = 1;
827 free_fib_info(fi);
828 ofi->fib_treeref++;
829 return ofi;
830 }
831
832 fi->fib_treeref++;
833 atomic_inc(&fi->fib_clntref);
832b4c5e 834 spin_lock_bh(&fib_info_lock);
1da177e4
LT
835 hlist_add_head(&fi->fib_hash,
836 &fib_info_hash[fib_info_hashfn(fi)]);
837 if (fi->fib_prefsrc) {
838 struct hlist_head *head;
839
840 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
841 hlist_add_head(&fi->fib_lhash, head);
842 }
843 change_nexthops(fi) {
844 struct hlist_head *head;
845 unsigned int hash;
846
847 if (!nh->nh_dev)
848 continue;
849 hash = fib_devindex_hashfn(nh->nh_dev->ifindex);
850 head = &fib_info_devhash[hash];
851 hlist_add_head(&nh->nh_hash, head);
852 } endfor_nexthops(fi)
832b4c5e 853 spin_unlock_bh(&fib_info_lock);
1da177e4
LT
854 return fi;
855
856err_inval:
857 err = -EINVAL;
858
859failure:
e905a9ed 860 if (fi) {
1da177e4
LT
861 fi->fib_dead = 1;
862 free_fib_info(fi);
863 }
4e902c57
TG
864
865 return ERR_PTR(err);
1da177e4
LT
866}
867
e5b43760 868/* Note! fib_semantic_match intentionally uses RCU list functions. */
1da177e4 869int fib_semantic_match(struct list_head *head, const struct flowi *flp,
1ef1b8c8 870 struct fib_result *res, __be32 zone, __be32 mask,
1da177e4
LT
871 int prefixlen)
872{
873 struct fib_alias *fa;
874 int nh_sel = 0;
875
e5b43760 876 list_for_each_entry_rcu(fa, head, fa_list) {
1da177e4
LT
877 int err;
878
879 if (fa->fa_tos &&
880 fa->fa_tos != flp->fl4_tos)
881 continue;
882
883 if (fa->fa_scope < flp->fl4_scope)
884 continue;
885
886 fa->fa_state |= FA_S_ACCESSED;
887
888 err = fib_props[fa->fa_type].error;
889 if (err == 0) {
890 struct fib_info *fi = fa->fa_info;
891
892 if (fi->fib_flags & RTNH_F_DEAD)
893 continue;
894
895 switch (fa->fa_type) {
896 case RTN_UNICAST:
897 case RTN_LOCAL:
898 case RTN_BROADCAST:
899 case RTN_ANYCAST:
900 case RTN_MULTICAST:
901 for_nexthops(fi) {
902 if (nh->nh_flags&RTNH_F_DEAD)
903 continue;
904 if (!flp->oif || flp->oif == nh->nh_oif)
905 break;
906 }
907#ifdef CONFIG_IP_ROUTE_MULTIPATH
908 if (nhsel < fi->fib_nhs) {
909 nh_sel = nhsel;
910 goto out_fill_res;
911 }
912#else
913 if (nhsel < 1) {
914 goto out_fill_res;
915 }
916#endif
917 endfor_nexthops(fi);
918 continue;
919
920 default:
a6db9010
SH
921 printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
922 fa->fa_type);
1da177e4 923 return -EINVAL;
3ff50b79 924 }
1da177e4
LT
925 }
926 return err;
927 }
928 return 1;
929
930out_fill_res:
931 res->prefixlen = prefixlen;
932 res->nh_sel = nh_sel;
933 res->type = fa->fa_type;
934 res->scope = fa->fa_scope;
935 res->fi = fa->fa_info;
1da177e4
LT
936 atomic_inc(&res->fi->fib_clntref);
937 return 0;
938}
939
940/* Find appropriate source address to this destination */
941
b83738ae 942__be32 __fib_res_prefsrc(struct fib_result *res)
1da177e4
LT
943{
944 return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
945}
946
be403ea1 947int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
81f7bf6c 948 u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
be403ea1 949 struct fib_info *fi, unsigned int flags)
1da177e4 950{
be403ea1 951 struct nlmsghdr *nlh;
1da177e4 952 struct rtmsg *rtm;
1da177e4 953
be403ea1
TG
954 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
955 if (nlh == NULL)
26932566 956 return -EMSGSIZE;
be403ea1
TG
957
958 rtm = nlmsg_data(nlh);
1da177e4
LT
959 rtm->rtm_family = AF_INET;
960 rtm->rtm_dst_len = dst_len;
961 rtm->rtm_src_len = 0;
962 rtm->rtm_tos = tos;
963 rtm->rtm_table = tb_id;
be403ea1 964 NLA_PUT_U32(skb, RTA_TABLE, tb_id);
1da177e4
LT
965 rtm->rtm_type = type;
966 rtm->rtm_flags = fi->fib_flags;
967 rtm->rtm_scope = scope;
1da177e4 968 rtm->rtm_protocol = fi->fib_protocol;
be403ea1
TG
969
970 if (rtm->rtm_dst_len)
17fb2c64 971 NLA_PUT_BE32(skb, RTA_DST, dst);
be403ea1 972
1da177e4 973 if (fi->fib_priority)
be403ea1
TG
974 NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
975
1da177e4 976 if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
be403ea1
TG
977 goto nla_put_failure;
978
1da177e4 979 if (fi->fib_prefsrc)
17fb2c64 980 NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
be403ea1 981
1da177e4
LT
982 if (fi->fib_nhs == 1) {
983 if (fi->fib_nh->nh_gw)
17fb2c64 984 NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
be403ea1 985
1da177e4 986 if (fi->fib_nh->nh_oif)
be403ea1 987 NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
8265abc0
PM
988#ifdef CONFIG_NET_CLS_ROUTE
989 if (fi->fib_nh[0].nh_tclassid)
be403ea1 990 NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
8265abc0 991#endif
1da177e4
LT
992 }
993#ifdef CONFIG_IP_ROUTE_MULTIPATH
994 if (fi->fib_nhs > 1) {
be403ea1
TG
995 struct rtnexthop *rtnh;
996 struct nlattr *mp;
997
998 mp = nla_nest_start(skb, RTA_MULTIPATH);
999 if (mp == NULL)
1000 goto nla_put_failure;
1da177e4
LT
1001
1002 for_nexthops(fi) {
be403ea1
TG
1003 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
1004 if (rtnh == NULL)
1005 goto nla_put_failure;
1006
1007 rtnh->rtnh_flags = nh->nh_flags & 0xFF;
1008 rtnh->rtnh_hops = nh->nh_weight - 1;
1009 rtnh->rtnh_ifindex = nh->nh_oif;
1010
1da177e4 1011 if (nh->nh_gw)
17fb2c64 1012 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
8265abc0
PM
1013#ifdef CONFIG_NET_CLS_ROUTE
1014 if (nh->nh_tclassid)
be403ea1 1015 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
8265abc0 1016#endif
be403ea1
TG
1017 /* length of rtnetlink header + attributes */
1018 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
1da177e4 1019 } endfor_nexthops(fi);
be403ea1
TG
1020
1021 nla_nest_end(skb, mp);
1da177e4
LT
1022 }
1023#endif
be403ea1 1024 return nlmsg_end(skb, nlh);
1da177e4 1025
be403ea1 1026nla_put_failure:
26932566
PM
1027 nlmsg_cancel(skb, nlh);
1028 return -EMSGSIZE;
1da177e4
LT
1029}
1030
1da177e4
LT
1031/*
1032 Update FIB if:
1033 - local address disappeared -> we must delete all the entries
1034 referring to it.
1035 - device went down -> we must shutdown all nexthops going via it.
1036 */
4814bdbd 1037int fib_sync_down_addr(struct net *net, __be32 local)
1da177e4
LT
1038{
1039 int ret = 0;
85326fa5
DL
1040 unsigned int hash = fib_laddr_hashfn(local);
1041 struct hlist_head *head = &fib_info_laddrhash[hash];
1042 struct hlist_node *node;
1043 struct fib_info *fi;
1da177e4 1044
85326fa5
DL
1045 if (fib_info_laddrhash == NULL || local == 0)
1046 return 0;
1da177e4 1047
85326fa5 1048 hlist_for_each_entry(fi, node, head, fib_lhash) {
4814bdbd
DL
1049 if (fi->fib_net != net)
1050 continue;
85326fa5
DL
1051 if (fi->fib_prefsrc == local) {
1052 fi->fib_flags |= RTNH_F_DEAD;
1053 ret++;
1da177e4
LT
1054 }
1055 }
85326fa5
DL
1056 return ret;
1057}
1058
1059int fib_sync_down_dev(struct net_device *dev, int force)
1060{
1061 int ret = 0;
1062 int scope = RT_SCOPE_NOWHERE;
1063 struct fib_info *prev_fi = NULL;
1064 unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1065 struct hlist_head *head = &fib_info_devhash[hash];
1066 struct hlist_node *node;
1067 struct fib_nh *nh;
1da177e4 1068
85326fa5
DL
1069 if (force)
1070 scope = -1;
1da177e4 1071
85326fa5
DL
1072 hlist_for_each_entry(nh, node, head, nh_hash) {
1073 struct fib_info *fi = nh->nh_parent;
1074 int dead;
1da177e4 1075
85326fa5
DL
1076 BUG_ON(!fi->fib_nhs);
1077 if (nh->nh_dev != dev || fi == prev_fi)
1078 continue;
1079 prev_fi = fi;
1080 dead = 0;
1081 change_nexthops(fi) {
1082 if (nh->nh_flags&RTNH_F_DEAD)
1083 dead++;
1084 else if (nh->nh_dev == dev &&
1085 nh->nh_scope != scope) {
1086 nh->nh_flags |= RTNH_F_DEAD;
1da177e4 1087#ifdef CONFIG_IP_ROUTE_MULTIPATH
85326fa5
DL
1088 spin_lock_bh(&fib_multipath_lock);
1089 fi->fib_power -= nh->nh_power;
1090 nh->nh_power = 0;
1091 spin_unlock_bh(&fib_multipath_lock);
1da177e4 1092#endif
85326fa5
DL
1093 dead++;
1094 }
1da177e4 1095#ifdef CONFIG_IP_ROUTE_MULTIPATH
85326fa5
DL
1096 if (force > 1 && nh->nh_dev == dev) {
1097 dead = fi->fib_nhs;
1098 break;
1da177e4 1099 }
85326fa5
DL
1100#endif
1101 } endfor_nexthops(fi)
1102 if (dead == fi->fib_nhs) {
1103 fi->fib_flags |= RTNH_F_DEAD;
1104 ret++;
1da177e4
LT
1105 }
1106 }
1107
1108 return ret;
1109}
1110
1111#ifdef CONFIG_IP_ROUTE_MULTIPATH
1112
1113/*
1114 Dead device goes up. We wake up dead nexthops.
1115 It takes sense only on multipath routes.
1116 */
1117
1118int fib_sync_up(struct net_device *dev)
1119{
1120 struct fib_info *prev_fi;
1121 unsigned int hash;
1122 struct hlist_head *head;
1123 struct hlist_node *node;
1124 struct fib_nh *nh;
1125 int ret;
1126
1127 if (!(dev->flags&IFF_UP))
1128 return 0;
1129
1130 prev_fi = NULL;
1131 hash = fib_devindex_hashfn(dev->ifindex);
1132 head = &fib_info_devhash[hash];
1133 ret = 0;
1134
1135 hlist_for_each_entry(nh, node, head, nh_hash) {
1136 struct fib_info *fi = nh->nh_parent;
1137 int alive;
1138
1139 BUG_ON(!fi->fib_nhs);
1140 if (nh->nh_dev != dev || fi == prev_fi)
1141 continue;
1142
1143 prev_fi = fi;
1144 alive = 0;
1145 change_nexthops(fi) {
1146 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1147 alive++;
1148 continue;
1149 }
1150 if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP))
1151 continue;
e5ed6399 1152 if (nh->nh_dev != dev || !__in_dev_get_rtnl(dev))
1da177e4
LT
1153 continue;
1154 alive++;
1155 spin_lock_bh(&fib_multipath_lock);
1156 nh->nh_power = 0;
1157 nh->nh_flags &= ~RTNH_F_DEAD;
1158 spin_unlock_bh(&fib_multipath_lock);
1159 } endfor_nexthops(fi)
1160
1161 if (alive > 0) {
1162 fi->fib_flags &= ~RTNH_F_DEAD;
1163 ret++;
1164 }
1165 }
1166
1167 return ret;
1168}
1169
1170/*
1171 The algorithm is suboptimal, but it provides really
1172 fair weighted route distribution.
1173 */
1174
1175void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1176{
1177 struct fib_info *fi = res->fi;
1178 int w;
1179
1180 spin_lock_bh(&fib_multipath_lock);
1181 if (fi->fib_power <= 0) {
1182 int power = 0;
1183 change_nexthops(fi) {
1184 if (!(nh->nh_flags&RTNH_F_DEAD)) {
1185 power += nh->nh_weight;
1186 nh->nh_power = nh->nh_weight;
1187 }
1188 } endfor_nexthops(fi);
1189 fi->fib_power = power;
1190 if (power <= 0) {
1191 spin_unlock_bh(&fib_multipath_lock);
1192 /* Race condition: route has just become dead. */
1193 res->nh_sel = 0;
1194 return;
1195 }
1196 }
1197
1198
1199 /* w should be random number [0..fi->fib_power-1],
1200 it is pretty bad approximation.
1201 */
1202
1203 w = jiffies % fi->fib_power;
1204
1205 change_nexthops(fi) {
1206 if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) {
1207 if ((w -= nh->nh_power) <= 0) {
1208 nh->nh_power--;
1209 fi->fib_power--;
1210 res->nh_sel = nhsel;
1211 spin_unlock_bh(&fib_multipath_lock);
1212 return;
1213 }
1214 }
1215 } endfor_nexthops(fi);
1216
1217 /* Race condition: route has just become dead. */
1218 res->nh_sel = 0;
1219 spin_unlock_bh(&fib_multipath_lock);
1220}
1221#endif