]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/ipv4/ip_gre.c
ipip: Optimize multiple unregistration
[net-next-2.6.git] / net / ipv4 / ip_gre.c
CommitLineData
1da177e4 1/*
e905a9ed 2 * Linux NET3: GRE over IP protocol decoder.
1da177e4
LT
3 *
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 */
12
4fc268d2 13#include <linux/capability.h>
1da177e4
LT
14#include <linux/module.h>
15#include <linux/types.h>
1da177e4
LT
16#include <linux/kernel.h>
17#include <asm/uaccess.h>
18#include <linux/skbuff.h>
19#include <linux/netdevice.h>
20#include <linux/in.h>
21#include <linux/tcp.h>
22#include <linux/udp.h>
23#include <linux/if_arp.h>
24#include <linux/mroute.h>
25#include <linux/init.h>
26#include <linux/in6.h>
27#include <linux/inetdevice.h>
28#include <linux/igmp.h>
29#include <linux/netfilter_ipv4.h>
e1a80002 30#include <linux/etherdevice.h>
46f25dff 31#include <linux/if_ether.h>
1da177e4
LT
32
33#include <net/sock.h>
34#include <net/ip.h>
35#include <net/icmp.h>
36#include <net/protocol.h>
37#include <net/ipip.h>
38#include <net/arp.h>
39#include <net/checksum.h>
40#include <net/dsfield.h>
41#include <net/inet_ecn.h>
42#include <net/xfrm.h>
59a4c759
PE
43#include <net/net_namespace.h>
44#include <net/netns/generic.h>
c19e654d 45#include <net/rtnetlink.h>
1da177e4
LT
46
47#ifdef CONFIG_IPV6
48#include <net/ipv6.h>
49#include <net/ip6_fib.h>
50#include <net/ip6_route.h>
51#endif
52
53/*
54 Problems & solutions
55 --------------------
56
57 1. The most important issue is detecting local dead loops.
58 They would cause complete host lockup in transmit, which
59 would be "resolved" by stack overflow or, if queueing is enabled,
60 with infinite looping in net_bh.
61
62 We cannot track such dead loops during route installation,
63 it is infeasible task. The most general solutions would be
64 to keep skb->encapsulation counter (sort of local ttl),
65 and silently drop packet when it expires. It is the best
66 solution, but it supposes maintaining a new variable in ALL
67 skb, even if no tunneling is used.
68
a43912ab 69 Current solution: HARD_TX_LOCK lock breaks dead loops.
1da177e4
LT
70
71
72
73 2. Networking dead loops would not kill routers, but would really
74 kill network. IP hop limit plays role of "t->recursion" in this case,
75 if we copy it from packet being encapsulated to upper header.
76 It is very good solution, but it introduces two problems:
77
78 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
79 do not work over tunnels.
80 - traceroute does not work. I planned to relay ICMP from tunnel,
81 so that this problem would be solved and traceroute output
82 would be even more informative. This idea appeared to be wrong:
83 only Linux complies to rfc1812 now (yes, guys, Linux is the only
84 true router now :-)), all routers (at least, in neighbourhood of mine)
85 return only 8 bytes of payload. It is the end.
86
87 Hence, if we want that OSPF worked or traceroute said something reasonable,
88 we should search for another solution.
89
90 One of them is to parse packet trying to detect inner encapsulation
91 made by our node. It is difficult or even impossible, especially,
92 taking into account fragmentation. To be short, it is not a solution at all.
93
94 Current solution: The solution was UNEXPECTEDLY SIMPLE.
95 We force DF flag on tunnels with preconfigured hop limit,
96 that is ALL. :-) Well, it does not remove the problem completely,
97 but exponential growth of network traffic is changed to linear
98 (branches, that exceed pmtu are pruned) and tunnel mtu
99 fastly degrades to value <68, where looping stops.
100 Yes, it is not good if there exists a router in the loop,
101 which does not force DF, even when encapsulating packets have DF set.
102 But it is not our problem! Nobody could accuse us, we made
103 all that we could make. Even if it is your gated who injected
104 fatal route to network, even if it were you who configured
105 fatal static route: you are innocent. :-)
106
107
108
109 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
110 practically identical code. It would be good to glue them
111 together, but it is not very evident, how to make them modular.
112 sit is integral part of IPv6, ipip and gre are naturally modular.
113 We could extract common parts (hash table, ioctl etc)
114 to a separate module (ip_tunnel.c).
115
116 Alexey Kuznetsov.
117 */
118
c19e654d 119static struct rtnl_link_ops ipgre_link_ops __read_mostly;
1da177e4
LT
120static int ipgre_tunnel_init(struct net_device *dev);
121static void ipgre_tunnel_setup(struct net_device *dev);
42aa9162 122static int ipgre_tunnel_bind_dev(struct net_device *dev);
1da177e4
LT
123
124/* Fallback tunnel: no source, no destination, no key, no options */
125
eb8ce741
PE
126#define HASH_SIZE 16
127
59a4c759
PE
128static int ipgre_net_id;
129struct ipgre_net {
eb8ce741
PE
130 struct ip_tunnel *tunnels[4][HASH_SIZE];
131
7daa0004 132 struct net_device *fb_tunnel_dev;
59a4c759
PE
133};
134
1da177e4
LT
135/* Tunnel hash table */
136
137/*
138 4 hash tables:
139
140 3: (remote,local)
141 2: (remote,*)
142 1: (*,local)
143 0: (*,*)
144
145 We require exact key match i.e. if a key is present in packet
146 it will match only tunnel with the same key; if it is not present,
147 it will match only keyless tunnel.
148
149 All keyless packets, if not matched by configured keyless tunnels,
150 will match fallback tunnel.
151 */
152
/*
 * Fold a 32-bit address or key into a bucket index in [0, HASH_SIZE).
 *
 * The argument is fully parenthesised so that an expression such as
 * HASH(a | b) hashes the value of the whole expression.  The mask is derived
 * from HASH_SIZE, which therefore has to be a power of two.
 */
#define HASH(addr) ((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & (HASH_SIZE - 1))
1da177e4 154
eb8ce741
PE
/* Shorthands for the four hash tables, named after the endpoints that key them. */
#define tunnels_r_l	tunnels[3]	/* (remote,local) */
#define tunnels_r	tunnels[2]	/* (remote,*) */
#define tunnels_l	tunnels[1]	/* (*,local) */
#define tunnels_wc	tunnels[0]	/* (*,*) */

/*
 * Locking : hash tables are protected by RCU and a spinlock
 */
static DEFINE_SPINLOCK(ipgre_lock);

/*
 * Walk a single hash chain, loading every link with rcu_dereference().
 * The iterator is not a parameter: a variable named "t" of type
 * struct ip_tunnel * must exist in the enclosing scope.
 */
#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
1da177e4
LT
166
167/* Given src, dst and key, find appropriate for input tunnel. */
168
749c10f9 169static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
e1a80002
HX
170 __be32 remote, __be32 local,
171 __be32 key, __be16 gre_proto)
1da177e4 172{
749c10f9
TT
173 struct net *net = dev_net(dev);
174 int link = dev->ifindex;
1da177e4
LT
175 unsigned h0 = HASH(remote);
176 unsigned h1 = HASH(key);
afcf1242 177 struct ip_tunnel *t, *cand = NULL;
7daa0004 178 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
e1a80002
HX
179 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
180 ARPHRD_ETHER : ARPHRD_IPGRE;
afcf1242 181 int score, cand_score = 4;
1da177e4 182
8d5b2c08 183 for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
749c10f9
TT
184 if (local != t->parms.iph.saddr ||
185 remote != t->parms.iph.daddr ||
186 key != t->parms.i_key ||
187 !(t->dev->flags & IFF_UP))
188 continue;
189
190 if (t->dev->type != ARPHRD_IPGRE &&
191 t->dev->type != dev_type)
192 continue;
193
afcf1242 194 score = 0;
749c10f9 195 if (t->parms.link != link)
afcf1242 196 score |= 1;
749c10f9 197 if (t->dev->type != dev_type)
afcf1242
TT
198 score |= 2;
199 if (score == 0)
749c10f9 200 return t;
afcf1242
TT
201
202 if (score < cand_score) {
203 cand = t;
204 cand_score = score;
205 }
1da177e4 206 }
e1a80002 207
8d5b2c08 208 for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
749c10f9
TT
209 if (remote != t->parms.iph.daddr ||
210 key != t->parms.i_key ||
211 !(t->dev->flags & IFF_UP))
212 continue;
213
214 if (t->dev->type != ARPHRD_IPGRE &&
215 t->dev->type != dev_type)
216 continue;
217
afcf1242 218 score = 0;
749c10f9 219 if (t->parms.link != link)
afcf1242 220 score |= 1;
749c10f9 221 if (t->dev->type != dev_type)
afcf1242
TT
222 score |= 2;
223 if (score == 0)
749c10f9 224 return t;
afcf1242
TT
225
226 if (score < cand_score) {
227 cand = t;
228 cand_score = score;
229 }
1da177e4 230 }
e1a80002 231
8d5b2c08 232 for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
749c10f9
TT
233 if ((local != t->parms.iph.saddr &&
234 (local != t->parms.iph.daddr ||
235 !ipv4_is_multicast(local))) ||
236 key != t->parms.i_key ||
237 !(t->dev->flags & IFF_UP))
238 continue;
239
240 if (t->dev->type != ARPHRD_IPGRE &&
241 t->dev->type != dev_type)
242 continue;
243
afcf1242 244 score = 0;
749c10f9 245 if (t->parms.link != link)
afcf1242 246 score |= 1;
749c10f9 247 if (t->dev->type != dev_type)
afcf1242
TT
248 score |= 2;
249 if (score == 0)
749c10f9 250 return t;
afcf1242
TT
251
252 if (score < cand_score) {
253 cand = t;
254 cand_score = score;
255 }
1da177e4 256 }
e1a80002 257
8d5b2c08 258 for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
749c10f9
TT
259 if (t->parms.i_key != key ||
260 !(t->dev->flags & IFF_UP))
261 continue;
262
263 if (t->dev->type != ARPHRD_IPGRE &&
264 t->dev->type != dev_type)
265 continue;
266
afcf1242 267 score = 0;
749c10f9 268 if (t->parms.link != link)
afcf1242 269 score |= 1;
749c10f9 270 if (t->dev->type != dev_type)
afcf1242
TT
271 score |= 2;
272 if (score == 0)
749c10f9 273 return t;
afcf1242
TT
274
275 if (score < cand_score) {
276 cand = t;
277 cand_score = score;
278 }
1da177e4
LT
279 }
280
afcf1242
TT
281 if (cand != NULL)
282 return cand;
e1a80002 283
8d5b2c08
ED
284 dev = ign->fb_tunnel_dev;
285 if (dev->flags & IFF_UP)
286 return netdev_priv(dev);
749c10f9 287
1da177e4
LT
288 return NULL;
289}
290
f57e7d5a
PE
291static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
292 struct ip_tunnel_parm *parms)
1da177e4 293{
5056a1ef
YH
294 __be32 remote = parms->iph.daddr;
295 __be32 local = parms->iph.saddr;
296 __be32 key = parms->i_key;
1da177e4
LT
297 unsigned h = HASH(key);
298 int prio = 0;
299
300 if (local)
301 prio |= 1;
f97c1e0c 302 if (remote && !ipv4_is_multicast(remote)) {
1da177e4
LT
303 prio |= 2;
304 h ^= HASH(remote);
305 }
306
eb8ce741 307 return &ign->tunnels[prio][h];
1da177e4
LT
308}
309
f57e7d5a
PE
310static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
311 struct ip_tunnel *t)
5056a1ef 312{
f57e7d5a 313 return __ipgre_bucket(ign, &t->parms);
5056a1ef
YH
314}
315
f57e7d5a 316static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
1da177e4 317{
f57e7d5a 318 struct ip_tunnel **tp = ipgre_bucket(ign, t);
1da177e4 319
8d5b2c08 320 spin_lock_bh(&ipgre_lock);
1da177e4 321 t->next = *tp;
8d5b2c08
ED
322 rcu_assign_pointer(*tp, t);
323 spin_unlock_bh(&ipgre_lock);
1da177e4
LT
324}
325
f57e7d5a 326static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
1da177e4
LT
327{
328 struct ip_tunnel **tp;
329
f57e7d5a 330 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
1da177e4 331 if (t == *tp) {
8d5b2c08 332 spin_lock_bh(&ipgre_lock);
1da177e4 333 *tp = t->next;
8d5b2c08 334 spin_unlock_bh(&ipgre_lock);
1da177e4
LT
335 break;
336 }
337 }
338}
339
e1a80002
HX
340static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
341 struct ip_tunnel_parm *parms,
342 int type)
1da177e4 343{
d5a0a1e3
AV
344 __be32 remote = parms->iph.daddr;
345 __be32 local = parms->iph.saddr;
346 __be32 key = parms->i_key;
749c10f9 347 int link = parms->link;
e1a80002
HX
348 struct ip_tunnel *t, **tp;
349 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
350
351 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
352 if (local == t->parms.iph.saddr &&
353 remote == t->parms.iph.daddr &&
354 key == t->parms.i_key &&
749c10f9 355 link == t->parms.link &&
e1a80002
HX
356 type == t->dev->type)
357 break;
358
359 return t;
360}
361
362static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
363 struct ip_tunnel_parm *parms, int create)
364{
365 struct ip_tunnel *t, *nt;
1da177e4 366 struct net_device *dev;
1da177e4 367 char name[IFNAMSIZ];
f57e7d5a 368 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1da177e4 369
e1a80002
HX
370 t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
371 if (t || !create)
372 return t;
1da177e4
LT
373
374 if (parms->name[0])
375 strlcpy(name, parms->name, IFNAMSIZ);
34cc7ba6
PE
376 else
377 sprintf(name, "gre%%d");
1da177e4
LT
378
379 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
380 if (!dev)
381 return NULL;
382
0b67eceb
PE
383 dev_net_set(dev, net);
384
b37d428b
PE
385 if (strchr(name, '%')) {
386 if (dev_alloc_name(dev, name) < 0)
387 goto failed_free;
388 }
389
2941a486 390 nt = netdev_priv(dev);
1da177e4 391 nt->parms = *parms;
c19e654d 392 dev->rtnl_link_ops = &ipgre_link_ops;
1da177e4 393
42aa9162
HX
394 dev->mtu = ipgre_tunnel_bind_dev(dev);
395
b37d428b
PE
396 if (register_netdevice(dev) < 0)
397 goto failed_free;
1da177e4 398
1da177e4 399 dev_hold(dev);
f57e7d5a 400 ipgre_tunnel_link(ign, nt);
1da177e4
LT
401 return nt;
402
b37d428b
PE
403failed_free:
404 free_netdev(dev);
1da177e4
LT
405 return NULL;
406}
407
408static void ipgre_tunnel_uninit(struct net_device *dev)
409{
f57e7d5a
PE
410 struct net *net = dev_net(dev);
411 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
412
413 ipgre_tunnel_unlink(ign, netdev_priv(dev));
1da177e4
LT
414 dev_put(dev);
415}
416
417
418static void ipgre_err(struct sk_buff *skb, u32 info)
419{
1da177e4 420
071f92d0 421/* All the routers (except for Linux) return only
1da177e4
LT
422 8 bytes of packet payload. It means, that precise relaying of
423 ICMP in the real Internet is absolutely infeasible.
424
425 Moreover, Cisco "wise men" put GRE key to the third word
426 in GRE header. It makes impossible maintaining even soft state for keyed
427 GRE tunnels with enabled checksum. Tell them "thank you".
428
429 Well, I wonder, rfc1812 was written by Cisco employee,
430 what the hell these idiots break standrads established
431 by themself???
432 */
433
6ed2533e 434 struct iphdr *iph = (struct iphdr *)skb->data;
d5a0a1e3 435 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
1da177e4 436 int grehlen = (iph->ihl<<2) + 4;
88c7664f
ACM
437 const int type = icmp_hdr(skb)->type;
438 const int code = icmp_hdr(skb)->code;
1da177e4 439 struct ip_tunnel *t;
d5a0a1e3 440 __be16 flags;
1da177e4
LT
441
442 flags = p[0];
443 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
444 if (flags&(GRE_VERSION|GRE_ROUTING))
445 return;
446 if (flags&GRE_KEY) {
447 grehlen += 4;
448 if (flags&GRE_CSUM)
449 grehlen += 4;
450 }
451 }
452
453 /* If only 8 bytes returned, keyed message will be dropped here */
454 if (skb_headlen(skb) < grehlen)
455 return;
456
457 switch (type) {
458 default:
459 case ICMP_PARAMETERPROB:
460 return;
461
462 case ICMP_DEST_UNREACH:
463 switch (code) {
464 case ICMP_SR_FAILED:
465 case ICMP_PORT_UNREACH:
466 /* Impossible event. */
467 return;
468 case ICMP_FRAG_NEEDED:
469 /* Soft state for pmtu is maintained by IP core. */
470 return;
471 default:
472 /* All others are translated to HOST_UNREACH.
473 rfc2003 contains "deep thoughts" about NET_UNREACH,
474 I believe they are just ether pollution. --ANK
475 */
476 break;
477 }
478 break;
479 case ICMP_TIME_EXCEEDED:
480 if (code != ICMP_EXC_TTL)
481 return;
482 break;
483 }
484
8d5b2c08 485 rcu_read_lock();
749c10f9 486 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
e1a80002
HX
487 flags & GRE_KEY ?
488 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
489 p[1]);
f97c1e0c
JP
490 if (t == NULL || t->parms.iph.daddr == 0 ||
491 ipv4_is_multicast(t->parms.iph.daddr))
1da177e4
LT
492 goto out;
493
494 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
495 goto out;
496
da6185d8 497 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
1da177e4
LT
498 t->err_count++;
499 else
500 t->err_count = 1;
501 t->err_time = jiffies;
502out:
8d5b2c08 503 rcu_read_unlock();
1da177e4 504 return;
1da177e4
LT
505}
506
507static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
508{
509 if (INET_ECN_is_ce(iph->tos)) {
510 if (skb->protocol == htons(ETH_P_IP)) {
eddc9ec5 511 IP_ECN_set_ce(ip_hdr(skb));
1da177e4 512 } else if (skb->protocol == htons(ETH_P_IPV6)) {
0660e03f 513 IP6_ECN_set_ce(ipv6_hdr(skb));
1da177e4
LT
514 }
515 }
516}
517
518static inline u8
519ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
520{
521 u8 inner = 0;
522 if (skb->protocol == htons(ETH_P_IP))
523 inner = old_iph->tos;
524 else if (skb->protocol == htons(ETH_P_IPV6))
525 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
526 return INET_ECN_encapsulate(tos, inner);
527}
528
529static int ipgre_rcv(struct sk_buff *skb)
530{
531 struct iphdr *iph;
532 u8 *h;
d5a0a1e3 533 __be16 flags;
d3bc23e7 534 __sum16 csum = 0;
d5a0a1e3 535 __be32 key = 0;
1da177e4
LT
536 u32 seqno = 0;
537 struct ip_tunnel *tunnel;
538 int offset = 4;
e1a80002 539 __be16 gre_proto;
64194c31 540 unsigned int len;
1da177e4
LT
541
542 if (!pskb_may_pull(skb, 16))
543 goto drop_nolock;
544
eddc9ec5 545 iph = ip_hdr(skb);
1da177e4 546 h = skb->data;
d5a0a1e3 547 flags = *(__be16*)h;
1da177e4
LT
548
549 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
550 /* - Version must be 0.
551 - We do not support routing headers.
552 */
553 if (flags&(GRE_VERSION|GRE_ROUTING))
554 goto drop_nolock;
555
556 if (flags&GRE_CSUM) {
fb286bb2 557 switch (skb->ip_summed) {
84fa7933 558 case CHECKSUM_COMPLETE:
d3bc23e7 559 csum = csum_fold(skb->csum);
fb286bb2
HX
560 if (!csum)
561 break;
562 /* fall through */
563 case CHECKSUM_NONE:
564 skb->csum = 0;
565 csum = __skb_checksum_complete(skb);
84fa7933 566 skb->ip_summed = CHECKSUM_COMPLETE;
1da177e4
LT
567 }
568 offset += 4;
569 }
570 if (flags&GRE_KEY) {
d5a0a1e3 571 key = *(__be32*)(h + offset);
1da177e4
LT
572 offset += 4;
573 }
574 if (flags&GRE_SEQ) {
d5a0a1e3 575 seqno = ntohl(*(__be32*)(h + offset));
1da177e4
LT
576 offset += 4;
577 }
578 }
579
e1a80002
HX
580 gre_proto = *(__be16 *)(h + 2);
581
8d5b2c08 582 rcu_read_lock();
749c10f9 583 if ((tunnel = ipgre_tunnel_lookup(skb->dev,
e1a80002
HX
584 iph->saddr, iph->daddr, key,
585 gre_proto))) {
addd68eb
PE
586 struct net_device_stats *stats = &tunnel->dev->stats;
587
1da177e4
LT
588 secpath_reset(skb);
589
e1a80002 590 skb->protocol = gre_proto;
1da177e4
LT
591 /* WCCP version 1 and 2 protocol decoding.
592 * - Change protocol to IP
593 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
594 */
e1a80002 595 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
496c98df 596 skb->protocol = htons(ETH_P_IP);
e905a9ed 597 if ((*(h + offset) & 0xF0) != 0x40)
1da177e4
LT
598 offset += 4;
599 }
600
1d069167 601 skb->mac_header = skb->network_header;
4209fb60 602 __pskb_pull(skb, offset);
9c70220b 603 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
1da177e4
LT
604 skb->pkt_type = PACKET_HOST;
605#ifdef CONFIG_NET_IPGRE_BROADCAST
f97c1e0c 606 if (ipv4_is_multicast(iph->daddr)) {
1da177e4 607 /* Looped back packet, drop it! */
511c3f92 608 if (skb_rtable(skb)->fl.iif == 0)
1da177e4 609 goto drop;
addd68eb 610 stats->multicast++;
1da177e4
LT
611 skb->pkt_type = PACKET_BROADCAST;
612 }
613#endif
614
615 if (((flags&GRE_CSUM) && csum) ||
616 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
addd68eb
PE
617 stats->rx_crc_errors++;
618 stats->rx_errors++;
1da177e4
LT
619 goto drop;
620 }
621 if (tunnel->parms.i_flags&GRE_SEQ) {
622 if (!(flags&GRE_SEQ) ||
623 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
addd68eb
PE
624 stats->rx_fifo_errors++;
625 stats->rx_errors++;
1da177e4
LT
626 goto drop;
627 }
628 tunnel->i_seqno = seqno + 1;
629 }
e1a80002 630
64194c31
HX
631 len = skb->len;
632
e1a80002
HX
633 /* Warning: All skb pointers will be invalidated! */
634 if (tunnel->dev->type == ARPHRD_ETHER) {
635 if (!pskb_may_pull(skb, ETH_HLEN)) {
636 stats->rx_length_errors++;
637 stats->rx_errors++;
638 goto drop;
639 }
640
641 iph = ip_hdr(skb);
642 skb->protocol = eth_type_trans(skb, tunnel->dev);
643 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
644 }
645
addd68eb 646 stats->rx_packets++;
64194c31 647 stats->rx_bytes += len;
1da177e4 648 skb->dev = tunnel->dev;
adf30907 649 skb_dst_drop(skb);
1da177e4 650 nf_reset(skb);
e1a80002
HX
651
652 skb_reset_network_header(skb);
1da177e4 653 ipgre_ecn_decapsulate(iph, skb);
e1a80002 654
1da177e4 655 netif_rx(skb);
8d5b2c08 656 rcu_read_unlock();
1da177e4
LT
657 return(0);
658 }
45af08be 659 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
1da177e4
LT
660
661drop:
8d5b2c08 662 rcu_read_unlock();
1da177e4
LT
663drop_nolock:
664 kfree_skb(skb);
665 return(0);
666}
667
6fef4c0c 668static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
1da177e4 669{
2941a486 670 struct ip_tunnel *tunnel = netdev_priv(dev);
0bfbedb1
ED
671 struct net_device_stats *stats = &dev->stats;
672 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
eddc9ec5 673 struct iphdr *old_iph = ip_hdr(skb);
1da177e4
LT
674 struct iphdr *tiph;
675 u8 tos;
d5a0a1e3 676 __be16 df;
1da177e4
LT
677 struct rtable *rt; /* Route to the other host */
678 struct net_device *tdev; /* Device to other host */
679 struct iphdr *iph; /* Our new IP header */
c2636b4d 680 unsigned int max_headroom; /* The extra header space needed */
1da177e4 681 int gre_hlen;
d5a0a1e3 682 __be32 dst;
1da177e4
LT
683 int mtu;
684
e1a80002
HX
685 if (dev->type == ARPHRD_ETHER)
686 IPCB(skb)->flags = 0;
687
688 if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
1da177e4 689 gre_hlen = 0;
6ed2533e 690 tiph = (struct iphdr *)skb->data;
1da177e4
LT
691 } else {
692 gre_hlen = tunnel->hlen;
693 tiph = &tunnel->parms.iph;
694 }
695
696 if ((dst = tiph->daddr) == 0) {
697 /* NBMA tunnel */
698
adf30907 699 if (skb_dst(skb) == NULL) {
addd68eb 700 stats->tx_fifo_errors++;
1da177e4
LT
701 goto tx_error;
702 }
703
704 if (skb->protocol == htons(ETH_P_IP)) {
511c3f92 705 rt = skb_rtable(skb);
1da177e4
LT
706 if ((dst = rt->rt_gateway) == 0)
707 goto tx_error_icmp;
708 }
709#ifdef CONFIG_IPV6
710 else if (skb->protocol == htons(ETH_P_IPV6)) {
711 struct in6_addr *addr6;
712 int addr_type;
adf30907 713 struct neighbour *neigh = skb_dst(skb)->neighbour;
1da177e4
LT
714
715 if (neigh == NULL)
716 goto tx_error;
717
6ed2533e 718 addr6 = (struct in6_addr *)&neigh->primary_key;
1da177e4
LT
719 addr_type = ipv6_addr_type(addr6);
720
721 if (addr_type == IPV6_ADDR_ANY) {
0660e03f 722 addr6 = &ipv6_hdr(skb)->daddr;
1da177e4
LT
723 addr_type = ipv6_addr_type(addr6);
724 }
725
726 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
727 goto tx_error_icmp;
728
729 dst = addr6->s6_addr32[3];
730 }
731#endif
732 else
733 goto tx_error;
734 }
735
736 tos = tiph->tos;
ee686ca9
AJ
737 if (tos == 1) {
738 tos = 0;
1da177e4
LT
739 if (skb->protocol == htons(ETH_P_IP))
740 tos = old_iph->tos;
1da177e4
LT
741 }
742
743 {
744 struct flowi fl = { .oif = tunnel->parms.link,
745 .nl_u = { .ip4_u =
746 { .daddr = dst,
747 .saddr = tiph->saddr,
748 .tos = RT_TOS(tos) } },
749 .proto = IPPROTO_GRE };
96635522 750 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
addd68eb 751 stats->tx_carrier_errors++;
1da177e4
LT
752 goto tx_error;
753 }
754 }
755 tdev = rt->u.dst.dev;
756
757 if (tdev == dev) {
758 ip_rt_put(rt);
addd68eb 759 stats->collisions++;
1da177e4
LT
760 goto tx_error;
761 }
762
763 df = tiph->frag_off;
764 if (df)
c95b819a 765 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
1da177e4 766 else
adf30907 767 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
1da177e4 768
adf30907
ED
769 if (skb_dst(skb))
770 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
1da177e4
LT
771
772 if (skb->protocol == htons(ETH_P_IP)) {
773 df |= (old_iph->frag_off&htons(IP_DF));
774
775 if ((old_iph->frag_off&htons(IP_DF)) &&
776 mtu < ntohs(old_iph->tot_len)) {
777 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
778 ip_rt_put(rt);
779 goto tx_error;
780 }
781 }
782#ifdef CONFIG_IPV6
783 else if (skb->protocol == htons(ETH_P_IPV6)) {
adf30907 784 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
1da177e4 785
adf30907 786 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
f97c1e0c
JP
787 if ((tunnel->parms.iph.daddr &&
788 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
1da177e4
LT
789 rt6->rt6i_dst.plen == 128) {
790 rt6->rt6i_flags |= RTF_MODIFIED;
adf30907 791 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
1da177e4
LT
792 }
793 }
794
795 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
796 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
797 ip_rt_put(rt);
798 goto tx_error;
799 }
800 }
801#endif
802
803 if (tunnel->err_count > 0) {
da6185d8
WY
804 if (time_before(jiffies,
805 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
1da177e4
LT
806 tunnel->err_count--;
807
808 dst_link_failure(skb);
809 } else
810 tunnel->err_count = 0;
811 }
812
813 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
814
cfbba49d
PM
815 if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
816 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
1da177e4
LT
817 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
818 if (!new_skb) {
819 ip_rt_put(rt);
0bfbedb1 820 txq->tx_dropped++;
1da177e4 821 dev_kfree_skb(skb);
6ed10654 822 return NETDEV_TX_OK;
1da177e4
LT
823 }
824 if (skb->sk)
825 skb_set_owner_w(new_skb, skb->sk);
826 dev_kfree_skb(skb);
827 skb = new_skb;
eddc9ec5 828 old_iph = ip_hdr(skb);
1da177e4
LT
829 }
830
64194c31 831 skb_reset_transport_header(skb);
e2d1bca7
ACM
832 skb_push(skb, gre_hlen);
833 skb_reset_network_header(skb);
1da177e4 834 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
48d5cad8
PM
835 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
836 IPSKB_REROUTED);
adf30907
ED
837 skb_dst_drop(skb);
838 skb_dst_set(skb, &rt->u.dst);
1da177e4
LT
839
840 /*
841 * Push down and install the IPIP header.
842 */
843
eddc9ec5 844 iph = ip_hdr(skb);
1da177e4
LT
845 iph->version = 4;
846 iph->ihl = sizeof(struct iphdr) >> 2;
847 iph->frag_off = df;
848 iph->protocol = IPPROTO_GRE;
849 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
850 iph->daddr = rt->rt_dst;
851 iph->saddr = rt->rt_src;
852
853 if ((iph->ttl = tiph->ttl) == 0) {
854 if (skb->protocol == htons(ETH_P_IP))
855 iph->ttl = old_iph->ttl;
856#ifdef CONFIG_IPV6
857 else if (skb->protocol == htons(ETH_P_IPV6))
6ed2533e 858 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
1da177e4
LT
859#endif
860 else
861 iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
862 }
863
e1a80002
HX
864 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
865 ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
866 htons(ETH_P_TEB) : skb->protocol;
1da177e4
LT
867
868 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
d5a0a1e3 869 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
1da177e4
LT
870
871 if (tunnel->parms.o_flags&GRE_SEQ) {
872 ++tunnel->o_seqno;
873 *ptr = htonl(tunnel->o_seqno);
874 ptr--;
875 }
876 if (tunnel->parms.o_flags&GRE_KEY) {
877 *ptr = tunnel->parms.o_key;
878 ptr--;
879 }
880 if (tunnel->parms.o_flags&GRE_CSUM) {
881 *ptr = 0;
5f92a738 882 *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
1da177e4
LT
883 }
884 }
885
886 nf_reset(skb);
887
888 IPTUNNEL_XMIT();
6ed10654 889 return NETDEV_TX_OK;
1da177e4
LT
890
891tx_error_icmp:
892 dst_link_failure(skb);
893
894tx_error:
895 stats->tx_errors++;
896 dev_kfree_skb(skb);
6ed10654 897 return NETDEV_TX_OK;
1da177e4
LT
898}
899
42aa9162 900static int ipgre_tunnel_bind_dev(struct net_device *dev)
ee34c1eb
MS
901{
902 struct net_device *tdev = NULL;
903 struct ip_tunnel *tunnel;
904 struct iphdr *iph;
905 int hlen = LL_MAX_HEADER;
906 int mtu = ETH_DATA_LEN;
907 int addend = sizeof(struct iphdr) + 4;
908
909 tunnel = netdev_priv(dev);
910 iph = &tunnel->parms.iph;
911
c95b819a 912 /* Guess output device to choose reasonable mtu and needed_headroom */
ee34c1eb
MS
913
914 if (iph->daddr) {
915 struct flowi fl = { .oif = tunnel->parms.link,
916 .nl_u = { .ip4_u =
917 { .daddr = iph->daddr,
918 .saddr = iph->saddr,
919 .tos = RT_TOS(iph->tos) } },
920 .proto = IPPROTO_GRE };
921 struct rtable *rt;
96635522 922 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
ee34c1eb
MS
923 tdev = rt->u.dst.dev;
924 ip_rt_put(rt);
925 }
e1a80002
HX
926
927 if (dev->type != ARPHRD_ETHER)
928 dev->flags |= IFF_POINTOPOINT;
ee34c1eb
MS
929 }
930
931 if (!tdev && tunnel->parms.link)
96635522 932 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
ee34c1eb
MS
933
934 if (tdev) {
c95b819a 935 hlen = tdev->hard_header_len + tdev->needed_headroom;
ee34c1eb
MS
936 mtu = tdev->mtu;
937 }
938 dev->iflink = tunnel->parms.link;
939
940 /* Precalculate GRE options length */
941 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
942 if (tunnel->parms.o_flags&GRE_CSUM)
943 addend += 4;
944 if (tunnel->parms.o_flags&GRE_KEY)
945 addend += 4;
946 if (tunnel->parms.o_flags&GRE_SEQ)
947 addend += 4;
948 }
c95b819a 949 dev->needed_headroom = addend + hlen;
8cdb0456 950 mtu -= dev->hard_header_len + addend;
42aa9162
HX
951
952 if (mtu < 68)
953 mtu = 68;
954
ee34c1eb
MS
955 tunnel->hlen = addend;
956
42aa9162 957 return mtu;
ee34c1eb
MS
958}
959
1da177e4
LT
960static int
961ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
962{
963 int err = 0;
964 struct ip_tunnel_parm p;
965 struct ip_tunnel *t;
f57e7d5a
PE
966 struct net *net = dev_net(dev);
967 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1da177e4
LT
968
969 switch (cmd) {
970 case SIOCGETTUNNEL:
971 t = NULL;
7daa0004 972 if (dev == ign->fb_tunnel_dev) {
1da177e4
LT
973 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
974 err = -EFAULT;
975 break;
976 }
f57e7d5a 977 t = ipgre_tunnel_locate(net, &p, 0);
1da177e4
LT
978 }
979 if (t == NULL)
2941a486 980 t = netdev_priv(dev);
1da177e4
LT
981 memcpy(&p, &t->parms, sizeof(p));
982 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
983 err = -EFAULT;
984 break;
985
986 case SIOCADDTUNNEL:
987 case SIOCCHGTUNNEL:
988 err = -EPERM;
989 if (!capable(CAP_NET_ADMIN))
990 goto done;
991
992 err = -EFAULT;
993 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
994 goto done;
995
996 err = -EINVAL;
997 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
998 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
999 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1000 goto done;
1001 if (p.iph.ttl)
1002 p.iph.frag_off |= htons(IP_DF);
1003
1004 if (!(p.i_flags&GRE_KEY))
1005 p.i_key = 0;
1006 if (!(p.o_flags&GRE_KEY))
1007 p.o_key = 0;
1008
f57e7d5a 1009 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1da177e4 1010
7daa0004 1011 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1da177e4
LT
1012 if (t != NULL) {
1013 if (t->dev != dev) {
1014 err = -EEXIST;
1015 break;
1016 }
1017 } else {
6ed2533e 1018 unsigned nflags = 0;
1da177e4 1019
2941a486 1020 t = netdev_priv(dev);
1da177e4 1021
f97c1e0c 1022 if (ipv4_is_multicast(p.iph.daddr))
1da177e4
LT
1023 nflags = IFF_BROADCAST;
1024 else if (p.iph.daddr)
1025 nflags = IFF_POINTOPOINT;
1026
1027 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1028 err = -EINVAL;
1029 break;
1030 }
f57e7d5a 1031 ipgre_tunnel_unlink(ign, t);
1da177e4
LT
1032 t->parms.iph.saddr = p.iph.saddr;
1033 t->parms.iph.daddr = p.iph.daddr;
1034 t->parms.i_key = p.i_key;
1035 t->parms.o_key = p.o_key;
1036 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1037 memcpy(dev->broadcast, &p.iph.daddr, 4);
f57e7d5a 1038 ipgre_tunnel_link(ign, t);
1da177e4
LT
1039 netdev_state_change(dev);
1040 }
1041 }
1042
1043 if (t) {
1044 err = 0;
1045 if (cmd == SIOCCHGTUNNEL) {
1046 t->parms.iph.ttl = p.iph.ttl;
1047 t->parms.iph.tos = p.iph.tos;
1048 t->parms.iph.frag_off = p.iph.frag_off;
ee34c1eb
MS
1049 if (t->parms.link != p.link) {
1050 t->parms.link = p.link;
42aa9162 1051 dev->mtu = ipgre_tunnel_bind_dev(dev);
ee34c1eb
MS
1052 netdev_state_change(dev);
1053 }
1da177e4
LT
1054 }
1055 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1056 err = -EFAULT;
1057 } else
1058 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1059 break;
1060
1061 case SIOCDELTUNNEL:
1062 err = -EPERM;
1063 if (!capable(CAP_NET_ADMIN))
1064 goto done;
1065
7daa0004 1066 if (dev == ign->fb_tunnel_dev) {
1da177e4
LT
1067 err = -EFAULT;
1068 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1069 goto done;
1070 err = -ENOENT;
f57e7d5a 1071 if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1da177e4
LT
1072 goto done;
1073 err = -EPERM;
7daa0004 1074 if (t == netdev_priv(ign->fb_tunnel_dev))
1da177e4
LT
1075 goto done;
1076 dev = t->dev;
1077 }
22f8cde5
SH
1078 unregister_netdevice(dev);
1079 err = 0;
1da177e4
LT
1080 break;
1081
1082 default:
1083 err = -EINVAL;
1084 }
1085
1086done:
1087 return err;
1088}
1089
1da177e4
LT
1090static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1091{
2941a486 1092 struct ip_tunnel *tunnel = netdev_priv(dev);
c95b819a
HX
1093 if (new_mtu < 68 ||
1094 new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1da177e4
LT
1095 return -EINVAL;
1096 dev->mtu = new_mtu;
1097 return 0;
1098}
1099
1da177e4
LT
1100/* Nice toy. Unfortunately, useless in real life :-)
1101 It allows to construct virtual multiprotocol broadcast "LAN"
1102 over the Internet, provided multicast routing is tuned.
1103
1104
1105 I have no idea was this bicycle invented before me,
1106 so that I had to set ARPHRD_IPGRE to a random value.
1107 I have an impression, that Cisco could make something similar,
1108 but this feature is apparently missing in IOS<=11.2(8).
e905a9ed 1109
1da177e4
LT
1110 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1111 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1112
1113 ping -t 255 224.66.66.66
1114
1115 If nobody answers, mbone does not work.
1116
1117 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1118 ip addr add 10.66.66.<somewhat>/24 dev Universe
1119 ifconfig Universe up
1120 ifconfig Universe add fe80::<Your_real_addr>/10
1121 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1122 ftp 10.66.66.66
1123 ...
1124 ftp fec0:6666:6666::193.233.7.65
1125 ...
1126
1127 */
1128
3b04ddde
SH
1129static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1130 unsigned short type,
1131 const void *daddr, const void *saddr, unsigned len)
1da177e4 1132{
2941a486 1133 struct ip_tunnel *t = netdev_priv(dev);
1da177e4 1134 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
d5a0a1e3 1135 __be16 *p = (__be16*)(iph+1);
1da177e4
LT
1136
1137 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1138 p[0] = t->parms.o_flags;
1139 p[1] = htons(type);
1140
1141 /*
e905a9ed 1142 * Set the source hardware address.
1da177e4 1143 */
e905a9ed 1144
1da177e4
LT
1145 if (saddr)
1146 memcpy(&iph->saddr, saddr, 4);
1147
1148 if (daddr) {
1149 memcpy(&iph->daddr, daddr, 4);
1150 return t->hlen;
1151 }
f97c1e0c 1152 if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1da177e4 1153 return t->hlen;
e905a9ed 1154
1da177e4
LT
1155 return -t->hlen;
1156}
1157
6a5f44d7
TT
1158static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1159{
6ed2533e 1160 struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
6a5f44d7
TT
1161 memcpy(haddr, &iph->saddr, 4);
1162 return 4;
1163}
1164
3b04ddde
SH
1165static const struct header_ops ipgre_header_ops = {
1166 .create = ipgre_header,
6a5f44d7 1167 .parse = ipgre_header_parse,
3b04ddde
SH
1168};
1169
6a5f44d7 1170#ifdef CONFIG_NET_IPGRE_BROADCAST
1da177e4
LT
1171static int ipgre_open(struct net_device *dev)
1172{
2941a486 1173 struct ip_tunnel *t = netdev_priv(dev);
1da177e4 1174
f97c1e0c 1175 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1da177e4
LT
1176 struct flowi fl = { .oif = t->parms.link,
1177 .nl_u = { .ip4_u =
1178 { .daddr = t->parms.iph.daddr,
1179 .saddr = t->parms.iph.saddr,
1180 .tos = RT_TOS(t->parms.iph.tos) } },
1181 .proto = IPPROTO_GRE };
1182 struct rtable *rt;
96635522 1183 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1da177e4
LT
1184 return -EADDRNOTAVAIL;
1185 dev = rt->u.dst.dev;
1186 ip_rt_put(rt);
e5ed6399 1187 if (__in_dev_get_rtnl(dev) == NULL)
1da177e4
LT
1188 return -EADDRNOTAVAIL;
1189 t->mlink = dev->ifindex;
e5ed6399 1190 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1da177e4
LT
1191 }
1192 return 0;
1193}
1194
1195static int ipgre_close(struct net_device *dev)
1196{
2941a486 1197 struct ip_tunnel *t = netdev_priv(dev);
b8c26a33 1198
f97c1e0c 1199 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
7fee0ca2 1200 struct in_device *in_dev;
c346dca1 1201 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1da177e4
LT
1202 if (in_dev) {
1203 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1204 in_dev_put(in_dev);
1205 }
1206 }
1207 return 0;
1208}
1209
1210#endif
1211
b8c26a33
SH
1212static const struct net_device_ops ipgre_netdev_ops = {
1213 .ndo_init = ipgre_tunnel_init,
1214 .ndo_uninit = ipgre_tunnel_uninit,
1215#ifdef CONFIG_NET_IPGRE_BROADCAST
1216 .ndo_open = ipgre_open,
1217 .ndo_stop = ipgre_close,
1218#endif
1219 .ndo_start_xmit = ipgre_tunnel_xmit,
1220 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1221 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1222};
1223
1da177e4
LT
1224static void ipgre_tunnel_setup(struct net_device *dev)
1225{
b8c26a33 1226 dev->netdev_ops = &ipgre_netdev_ops;
1da177e4 1227 dev->destructor = free_netdev;
1da177e4
LT
1228
1229 dev->type = ARPHRD_IPGRE;
c95b819a 1230 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
46f25dff 1231 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1da177e4
LT
1232 dev->flags = IFF_NOARP;
1233 dev->iflink = 0;
1234 dev->addr_len = 4;
0b67eceb 1235 dev->features |= NETIF_F_NETNS_LOCAL;
108bfa89 1236 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
1da177e4
LT
1237}
1238
1239static int ipgre_tunnel_init(struct net_device *dev)
1240{
1da177e4
LT
1241 struct ip_tunnel *tunnel;
1242 struct iphdr *iph;
1da177e4 1243
2941a486 1244 tunnel = netdev_priv(dev);
1da177e4
LT
1245 iph = &tunnel->parms.iph;
1246
1247 tunnel->dev = dev;
1248 strcpy(tunnel->parms.name, dev->name);
1249
1250 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1251 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1252
1da177e4 1253 if (iph->daddr) {
1da177e4 1254#ifdef CONFIG_NET_IPGRE_BROADCAST
f97c1e0c 1255 if (ipv4_is_multicast(iph->daddr)) {
1da177e4
LT
1256 if (!iph->saddr)
1257 return -EINVAL;
1258 dev->flags = IFF_BROADCAST;
3b04ddde 1259 dev->header_ops = &ipgre_header_ops;
1da177e4
LT
1260 }
1261#endif
ee34c1eb 1262 } else
6a5f44d7 1263 dev->header_ops = &ipgre_header_ops;
1da177e4 1264
1da177e4
LT
1265 return 0;
1266}
1267
b8c26a33 1268static void ipgre_fb_tunnel_init(struct net_device *dev)
1da177e4 1269{
2941a486 1270 struct ip_tunnel *tunnel = netdev_priv(dev);
1da177e4 1271 struct iphdr *iph = &tunnel->parms.iph;
eb8ce741 1272 struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1da177e4
LT
1273
1274 tunnel->dev = dev;
1275 strcpy(tunnel->parms.name, dev->name);
1276
1277 iph->version = 4;
1278 iph->protocol = IPPROTO_GRE;
1279 iph->ihl = 5;
1280 tunnel->hlen = sizeof(struct iphdr) + 4;
1281
1282 dev_hold(dev);
eb8ce741 1283 ign->tunnels_wc[0] = tunnel;
1da177e4
LT
1284}
1285
1286
32613090 1287static const struct net_protocol ipgre_protocol = {
1da177e4
LT
1288 .handler = ipgre_rcv,
1289 .err_handler = ipgre_err,
f96c148f 1290 .netns_ok = 1,
1da177e4
LT
1291};
1292
eb8ce741
PE
1293static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1294{
1295 int prio;
1296
1297 for (prio = 0; prio < 4; prio++) {
1298 int h;
1299 for (h = 0; h < HASH_SIZE; h++) {
1300 struct ip_tunnel *t;
1301 while ((t = ign->tunnels[prio][h]) != NULL)
1302 unregister_netdevice(t->dev);
1303 }
1304 }
1305}
1306
59a4c759
PE
1307static int ipgre_init_net(struct net *net)
1308{
1309 int err;
1310 struct ipgre_net *ign;
1311
1312 err = -ENOMEM;
eb8ce741 1313 ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
59a4c759
PE
1314 if (ign == NULL)
1315 goto err_alloc;
1316
1317 err = net_assign_generic(net, ipgre_net_id, ign);
1318 if (err < 0)
1319 goto err_assign;
1320
7daa0004
PE
1321 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1322 ipgre_tunnel_setup);
1323 if (!ign->fb_tunnel_dev) {
1324 err = -ENOMEM;
1325 goto err_alloc_dev;
1326 }
be77e593 1327 dev_net_set(ign->fb_tunnel_dev, net);
7daa0004 1328
b8c26a33 1329 ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
c19e654d 1330 ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
7daa0004
PE
1331
1332 if ((err = register_netdev(ign->fb_tunnel_dev)))
1333 goto err_reg_dev;
1334
59a4c759
PE
1335 return 0;
1336
7daa0004
PE
1337err_reg_dev:
1338 free_netdev(ign->fb_tunnel_dev);
1339err_alloc_dev:
1340 /* nothing */
59a4c759
PE
1341err_assign:
1342 kfree(ign);
1343err_alloc:
1344 return err;
1345}
1346
1347static void ipgre_exit_net(struct net *net)
1348{
1349 struct ipgre_net *ign;
1350
1351 ign = net_generic(net, ipgre_net_id);
7daa0004 1352 rtnl_lock();
eb8ce741 1353 ipgre_destroy_tunnels(ign);
7daa0004 1354 rtnl_unlock();
59a4c759
PE
1355 kfree(ign);
1356}
1357
1358static struct pernet_operations ipgre_net_ops = {
1359 .init = ipgre_init_net,
1360 .exit = ipgre_exit_net,
1361};
1da177e4 1362
c19e654d
HX
1363static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1364{
1365 __be16 flags;
1366
1367 if (!data)
1368 return 0;
1369
1370 flags = 0;
1371 if (data[IFLA_GRE_IFLAGS])
1372 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1373 if (data[IFLA_GRE_OFLAGS])
1374 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1375 if (flags & (GRE_VERSION|GRE_ROUTING))
1376 return -EINVAL;
1377
1378 return 0;
1379}
1380
e1a80002
HX
1381static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1382{
1383 __be32 daddr;
1384
1385 if (tb[IFLA_ADDRESS]) {
1386 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1387 return -EINVAL;
1388 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1389 return -EADDRNOTAVAIL;
1390 }
1391
1392 if (!data)
1393 goto out;
1394
1395 if (data[IFLA_GRE_REMOTE]) {
1396 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1397 if (!daddr)
1398 return -EINVAL;
1399 }
1400
1401out:
1402 return ipgre_tunnel_validate(tb, data);
1403}
1404
c19e654d
HX
1405static void ipgre_netlink_parms(struct nlattr *data[],
1406 struct ip_tunnel_parm *parms)
1407{
7bb82d92 1408 memset(parms, 0, sizeof(*parms));
c19e654d
HX
1409
1410 parms->iph.protocol = IPPROTO_GRE;
1411
1412 if (!data)
1413 return;
1414
1415 if (data[IFLA_GRE_LINK])
1416 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1417
1418 if (data[IFLA_GRE_IFLAGS])
1419 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1420
1421 if (data[IFLA_GRE_OFLAGS])
1422 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1423
1424 if (data[IFLA_GRE_IKEY])
1425 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1426
1427 if (data[IFLA_GRE_OKEY])
1428 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1429
1430 if (data[IFLA_GRE_LOCAL])
4d74f8ba 1431 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
c19e654d
HX
1432
1433 if (data[IFLA_GRE_REMOTE])
4d74f8ba 1434 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
c19e654d
HX
1435
1436 if (data[IFLA_GRE_TTL])
1437 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1438
1439 if (data[IFLA_GRE_TOS])
1440 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1441
1442 if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1443 parms->iph.frag_off = htons(IP_DF);
1444}
1445
e1a80002
HX
1446static int ipgre_tap_init(struct net_device *dev)
1447{
1448 struct ip_tunnel *tunnel;
1449
1450 tunnel = netdev_priv(dev);
1451
1452 tunnel->dev = dev;
1453 strcpy(tunnel->parms.name, dev->name);
1454
1455 ipgre_tunnel_bind_dev(dev);
1456
1457 return 0;
1458}
1459
b8c26a33
SH
1460static const struct net_device_ops ipgre_tap_netdev_ops = {
1461 .ndo_init = ipgre_tap_init,
1462 .ndo_uninit = ipgre_tunnel_uninit,
1463 .ndo_start_xmit = ipgre_tunnel_xmit,
1464 .ndo_set_mac_address = eth_mac_addr,
1465 .ndo_validate_addr = eth_validate_addr,
1466 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1467};
1468
e1a80002
HX
1469static void ipgre_tap_setup(struct net_device *dev)
1470{
1471
1472 ether_setup(dev);
1473
b8c26a33 1474 dev->netdev_ops = &ipgre_netdev_ops;
e1a80002 1475 dev->destructor = free_netdev;
e1a80002
HX
1476
1477 dev->iflink = 0;
1478 dev->features |= NETIF_F_NETNS_LOCAL;
1479}
1480
c19e654d
HX
1481static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
1482 struct nlattr *data[])
1483{
1484 struct ip_tunnel *nt;
1485 struct net *net = dev_net(dev);
1486 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1487 int mtu;
1488 int err;
1489
1490 nt = netdev_priv(dev);
1491 ipgre_netlink_parms(data, &nt->parms);
1492
e1a80002 1493 if (ipgre_tunnel_find(net, &nt->parms, dev->type))
c19e654d
HX
1494 return -EEXIST;
1495
e1a80002
HX
1496 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1497 random_ether_addr(dev->dev_addr);
1498
c19e654d
HX
1499 mtu = ipgre_tunnel_bind_dev(dev);
1500 if (!tb[IFLA_MTU])
1501 dev->mtu = mtu;
1502
1503 err = register_netdevice(dev);
1504 if (err)
1505 goto out;
1506
1507 dev_hold(dev);
1508 ipgre_tunnel_link(ign, nt);
1509
1510out:
1511 return err;
1512}
1513
1514static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1515 struct nlattr *data[])
1516{
1517 struct ip_tunnel *t, *nt;
1518 struct net *net = dev_net(dev);
1519 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1520 struct ip_tunnel_parm p;
1521 int mtu;
1522
1523 if (dev == ign->fb_tunnel_dev)
1524 return -EINVAL;
1525
1526 nt = netdev_priv(dev);
1527 ipgre_netlink_parms(data, &p);
1528
1529 t = ipgre_tunnel_locate(net, &p, 0);
1530
1531 if (t) {
1532 if (t->dev != dev)
1533 return -EEXIST;
1534 } else {
1535 unsigned nflags = 0;
1536
1537 t = nt;
1538
1539 if (ipv4_is_multicast(p.iph.daddr))
1540 nflags = IFF_BROADCAST;
1541 else if (p.iph.daddr)
1542 nflags = IFF_POINTOPOINT;
1543
1544 if ((dev->flags ^ nflags) &
1545 (IFF_POINTOPOINT | IFF_BROADCAST))
1546 return -EINVAL;
1547
1548 ipgre_tunnel_unlink(ign, t);
1549 t->parms.iph.saddr = p.iph.saddr;
1550 t->parms.iph.daddr = p.iph.daddr;
1551 t->parms.i_key = p.i_key;
1552 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1553 memcpy(dev->broadcast, &p.iph.daddr, 4);
1554 ipgre_tunnel_link(ign, t);
1555 netdev_state_change(dev);
1556 }
1557
1558 t->parms.o_key = p.o_key;
1559 t->parms.iph.ttl = p.iph.ttl;
1560 t->parms.iph.tos = p.iph.tos;
1561 t->parms.iph.frag_off = p.iph.frag_off;
1562
1563 if (t->parms.link != p.link) {
1564 t->parms.link = p.link;
1565 mtu = ipgre_tunnel_bind_dev(dev);
1566 if (!tb[IFLA_MTU])
1567 dev->mtu = mtu;
1568 netdev_state_change(dev);
1569 }
1570
1571 return 0;
1572}
1573
1574static size_t ipgre_get_size(const struct net_device *dev)
1575{
1576 return
1577 /* IFLA_GRE_LINK */
1578 nla_total_size(4) +
1579 /* IFLA_GRE_IFLAGS */
1580 nla_total_size(2) +
1581 /* IFLA_GRE_OFLAGS */
1582 nla_total_size(2) +
1583 /* IFLA_GRE_IKEY */
1584 nla_total_size(4) +
1585 /* IFLA_GRE_OKEY */
1586 nla_total_size(4) +
1587 /* IFLA_GRE_LOCAL */
1588 nla_total_size(4) +
1589 /* IFLA_GRE_REMOTE */
1590 nla_total_size(4) +
1591 /* IFLA_GRE_TTL */
1592 nla_total_size(1) +
1593 /* IFLA_GRE_TOS */
1594 nla_total_size(1) +
1595 /* IFLA_GRE_PMTUDISC */
1596 nla_total_size(1) +
1597 0;
1598}
1599
1600static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1601{
1602 struct ip_tunnel *t = netdev_priv(dev);
1603 struct ip_tunnel_parm *p = &t->parms;
1604
1605 NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1606 NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1607 NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
ba9e64b1
PM
1608 NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1609 NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
4d74f8ba
PM
1610 NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1611 NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
c19e654d
HX
1612 NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1613 NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1614 NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1615
1616 return 0;
1617
1618nla_put_failure:
1619 return -EMSGSIZE;
1620}
1621
1622static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1623 [IFLA_GRE_LINK] = { .type = NLA_U32 },
1624 [IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
1625 [IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
1626 [IFLA_GRE_IKEY] = { .type = NLA_U32 },
1627 [IFLA_GRE_OKEY] = { .type = NLA_U32 },
4d74f8ba
PM
1628 [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1629 [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
c19e654d
HX
1630 [IFLA_GRE_TTL] = { .type = NLA_U8 },
1631 [IFLA_GRE_TOS] = { .type = NLA_U8 },
1632 [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
1633};
1634
1635static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1636 .kind = "gre",
1637 .maxtype = IFLA_GRE_MAX,
1638 .policy = ipgre_policy,
1639 .priv_size = sizeof(struct ip_tunnel),
1640 .setup = ipgre_tunnel_setup,
1641 .validate = ipgre_tunnel_validate,
1642 .newlink = ipgre_newlink,
1643 .changelink = ipgre_changelink,
1644 .get_size = ipgre_get_size,
1645 .fill_info = ipgre_fill_info,
1646};
1647
e1a80002
HX
1648static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1649 .kind = "gretap",
1650 .maxtype = IFLA_GRE_MAX,
1651 .policy = ipgre_policy,
1652 .priv_size = sizeof(struct ip_tunnel),
1653 .setup = ipgre_tap_setup,
1654 .validate = ipgre_tap_validate,
1655 .newlink = ipgre_newlink,
1656 .changelink = ipgre_changelink,
1657 .get_size = ipgre_get_size,
1658 .fill_info = ipgre_fill_info,
1659};
1660
1da177e4
LT
1661/*
1662 * And now the modules code and kernel interface.
1663 */
1664
1665static int __init ipgre_init(void)
1666{
1667 int err;
1668
1669 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1670
1671 if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1672 printk(KERN_INFO "ipgre init: can't add protocol\n");
1673 return -EAGAIN;
1674 }
1675
59a4c759
PE
1676 err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1677 if (err < 0)
c19e654d 1678 goto gen_device_failed;
7daa0004 1679
c19e654d
HX
1680 err = rtnl_link_register(&ipgre_link_ops);
1681 if (err < 0)
1682 goto rtnl_link_failed;
1683
e1a80002
HX
1684 err = rtnl_link_register(&ipgre_tap_ops);
1685 if (err < 0)
1686 goto tap_ops_failed;
1687
c19e654d 1688out:
1da177e4 1689 return err;
c19e654d 1690
e1a80002
HX
1691tap_ops_failed:
1692 rtnl_link_unregister(&ipgre_link_ops);
c19e654d
HX
1693rtnl_link_failed:
1694 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1695gen_device_failed:
1696 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1697 goto out;
1da177e4
LT
1698}
1699
db44575f 1700static void __exit ipgre_fini(void)
1da177e4 1701{
e1a80002 1702 rtnl_link_unregister(&ipgre_tap_ops);
c19e654d
HX
1703 rtnl_link_unregister(&ipgre_link_ops);
1704 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1da177e4
LT
1705 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1706 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1da177e4
LT
1707}
1708
1709module_init(ipgre_init);
1710module_exit(ipgre_fini);
1711MODULE_LICENSE("GPL");
4d74f8ba
PM
1712MODULE_ALIAS_RTNL_LINK("gre");
1713MODULE_ALIAS_RTNL_LINK("gretap");