]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/ipv4/ip_gre.c
ip6tnl: convert hash tables locking to RCU
[net-next-2.6.git] / net / ipv4 / ip_gre.c
CommitLineData
1da177e4 1/*
e905a9ed 2 * Linux NET3: GRE over IP protocol decoder.
1da177e4
LT
3 *
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 */
12
4fc268d2 13#include <linux/capability.h>
1da177e4
LT
14#include <linux/module.h>
15#include <linux/types.h>
1da177e4
LT
16#include <linux/kernel.h>
17#include <asm/uaccess.h>
18#include <linux/skbuff.h>
19#include <linux/netdevice.h>
20#include <linux/in.h>
21#include <linux/tcp.h>
22#include <linux/udp.h>
23#include <linux/if_arp.h>
24#include <linux/mroute.h>
25#include <linux/init.h>
26#include <linux/in6.h>
27#include <linux/inetdevice.h>
28#include <linux/igmp.h>
29#include <linux/netfilter_ipv4.h>
e1a80002 30#include <linux/etherdevice.h>
46f25dff 31#include <linux/if_ether.h>
1da177e4
LT
32
33#include <net/sock.h>
34#include <net/ip.h>
35#include <net/icmp.h>
36#include <net/protocol.h>
37#include <net/ipip.h>
38#include <net/arp.h>
39#include <net/checksum.h>
40#include <net/dsfield.h>
41#include <net/inet_ecn.h>
42#include <net/xfrm.h>
59a4c759
PE
43#include <net/net_namespace.h>
44#include <net/netns/generic.h>
c19e654d 45#include <net/rtnetlink.h>
1da177e4
LT
46
47#ifdef CONFIG_IPV6
48#include <net/ipv6.h>
49#include <net/ip6_fib.h>
50#include <net/ip6_route.h>
51#endif
52
53/*
54 Problems & solutions
55 --------------------
56
57 1. The most important issue is detecting local dead loops.
58 They would cause complete host lockup in transmit, which
59 would be "resolved" by stack overflow or, if queueing is enabled,
60 with infinite looping in net_bh.
61
62 We cannot track such dead loops during route installation,
63 it is infeasible task. The most general solutions would be
64 to keep skb->encapsulation counter (sort of local ttl),
65 and silently drop packet when it expires. It is the best
66 solution, but it supposes maintaining a new variable in ALL
67 skb, even if no tunneling is used.
68
a43912ab 69 Current solution: HARD_TX_LOCK lock breaks dead loops.
1da177e4
LT
70
71
72
73 2. Networking dead loops would not kill routers, but would really
74 kill network. IP hop limit plays role of "t->recursion" in this case,
75 if we copy it from packet being encapsulated to upper header.
76 It is very good solution, but it introduces two problems:
77
78 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
79 do not work over tunnels.
80 - traceroute does not work. I planned to relay ICMP from tunnel,
81 so that this problem would be solved and traceroute output
82 would even more informative. This idea appeared to be wrong:
83 only Linux complies to rfc1812 now (yes, guys, Linux is the only
84 true router now :-)), all routers (at least, in neighbourhood of mine)
85 return only 8 bytes of payload. It is the end.
86
87 Hence, if we want that OSPF worked or traceroute said something reasonable,
88 we should search for another solution.
89
90 One of them is to parse packet trying to detect inner encapsulation
91 made by our node. It is difficult or even impossible, especially,
92 taking into account fragmentation. To be short, it is not a solution at all.
93
94 Current solution: The solution was UNEXPECTEDLY SIMPLE.
95 We force DF flag on tunnels with preconfigured hop limit,
96 that is ALL. :-) Well, it does not remove the problem completely,
97 but exponential growth of network traffic is changed to linear
98 (branches, that exceed pmtu are pruned) and tunnel mtu
99 fastly degrades to value <68, where looping stops.
100 Yes, it is not good if there exists a router in the loop,
101 which does not force DF, even when encapsulating packets have DF set.
102 But it is not our problem! Nobody could accuse us, we made
103 all that we could make. Even if it is your gated who injected
104 fatal route to network, even if it were you who configured
105 fatal static route: you are innocent. :-)
106
107
108
109 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
110 practically identical code. It would be good to glue them
111 together, but it is not very evident, how to make them modular.
112 sit is integral part of IPv6, ipip and gre are naturally modular.
113 We could extract common parts (hash table, ioctl etc)
114 to a separate module (ip_tunnel.c).
115
116 Alexey Kuznetsov.
117 */
118
c19e654d 119static struct rtnl_link_ops ipgre_link_ops __read_mostly;
1da177e4
LT
120static int ipgre_tunnel_init(struct net_device *dev);
121static void ipgre_tunnel_setup(struct net_device *dev);
42aa9162 122static int ipgre_tunnel_bind_dev(struct net_device *dev);
1da177e4
LT
123
124/* Fallback tunnel: no source, no destination, no key, no options */
125
eb8ce741
PE
/* Number of buckets in each tunnel hash table. */
#define HASH_SIZE	16

/* Identifier handed to net_generic() to fetch this driver's struct ipgre_net. */
static int ipgre_net_id;

/* Per-network-namespace state of the GRE driver. */
struct ipgre_net {
	/* Tunnel hash chains, indexed as [table][bucket]. */
	struct ip_tunnel	*tunnels[4][HASH_SIZE];

	/* Device returned by lookup when no configured tunnel matches. */
	struct net_device	*fb_tunnel_dev;
};
134
1da177e4
LT
135/* Tunnel hash table */
136
137/*
138 4 hash tables:
139
140 3: (remote,local)
141 2: (remote,*)
142 1: (*,local)
143 0: (*,*)
144
145 We require exact key match i.e. if a key is present in packet
146 it will match only tunnel with the same key; if it is not present,
147 it will match only keyless tunnel.
148
149 All keyless packets that do not match a configured keyless tunnel
150 will match the fallback tunnel.
151 */
152
/*
 * Fold a 32-bit address or key into a bucket index in the range 0..15.
 * The argument is fully parenthesized so that compound expressions
 * (e.g. "a | b") are hashed as a whole rather than torn apart by
 * operator precedence.
 */
#define HASH(addr) ((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & 0xF)
1da177e4 154
eb8ce741
PE
/* Member-name shorthands for the four hash tables of struct ipgre_net. */
#define tunnels_r_l	tunnels[3]	/* (remote, local) */
#define tunnels_r	tunnels[2]	/* (remote, *)     */
#define tunnels_l	tunnels[1]	/* (*, local)      */
#define tunnels_wc	tunnels[0]	/* (*, *)          */

/*
 * Protects the tunnel hash chains: chain updates take it for writing,
 * the receive and ICMP error paths take it for reading.
 */
static DEFINE_RWLOCK(ipgre_lock);
161
162/* Given src, dst and key, find appropriate for input tunnel. */
163
749c10f9 164static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
e1a80002
HX
165 __be32 remote, __be32 local,
166 __be32 key, __be16 gre_proto)
1da177e4 167{
749c10f9
TT
168 struct net *net = dev_net(dev);
169 int link = dev->ifindex;
1da177e4
LT
170 unsigned h0 = HASH(remote);
171 unsigned h1 = HASH(key);
afcf1242 172 struct ip_tunnel *t, *cand = NULL;
7daa0004 173 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
e1a80002
HX
174 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
175 ARPHRD_ETHER : ARPHRD_IPGRE;
afcf1242 176 int score, cand_score = 4;
1da177e4 177
eb8ce741 178 for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
749c10f9
TT
179 if (local != t->parms.iph.saddr ||
180 remote != t->parms.iph.daddr ||
181 key != t->parms.i_key ||
182 !(t->dev->flags & IFF_UP))
183 continue;
184
185 if (t->dev->type != ARPHRD_IPGRE &&
186 t->dev->type != dev_type)
187 continue;
188
afcf1242 189 score = 0;
749c10f9 190 if (t->parms.link != link)
afcf1242 191 score |= 1;
749c10f9 192 if (t->dev->type != dev_type)
afcf1242
TT
193 score |= 2;
194 if (score == 0)
749c10f9 195 return t;
afcf1242
TT
196
197 if (score < cand_score) {
198 cand = t;
199 cand_score = score;
200 }
1da177e4 201 }
e1a80002 202
eb8ce741 203 for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
749c10f9
TT
204 if (remote != t->parms.iph.daddr ||
205 key != t->parms.i_key ||
206 !(t->dev->flags & IFF_UP))
207 continue;
208
209 if (t->dev->type != ARPHRD_IPGRE &&
210 t->dev->type != dev_type)
211 continue;
212
afcf1242 213 score = 0;
749c10f9 214 if (t->parms.link != link)
afcf1242 215 score |= 1;
749c10f9 216 if (t->dev->type != dev_type)
afcf1242
TT
217 score |= 2;
218 if (score == 0)
749c10f9 219 return t;
afcf1242
TT
220
221 if (score < cand_score) {
222 cand = t;
223 cand_score = score;
224 }
1da177e4 225 }
e1a80002 226
eb8ce741 227 for (t = ign->tunnels_l[h1]; t; t = t->next) {
749c10f9
TT
228 if ((local != t->parms.iph.saddr &&
229 (local != t->parms.iph.daddr ||
230 !ipv4_is_multicast(local))) ||
231 key != t->parms.i_key ||
232 !(t->dev->flags & IFF_UP))
233 continue;
234
235 if (t->dev->type != ARPHRD_IPGRE &&
236 t->dev->type != dev_type)
237 continue;
238
afcf1242 239 score = 0;
749c10f9 240 if (t->parms.link != link)
afcf1242 241 score |= 1;
749c10f9 242 if (t->dev->type != dev_type)
afcf1242
TT
243 score |= 2;
244 if (score == 0)
749c10f9 245 return t;
afcf1242
TT
246
247 if (score < cand_score) {
248 cand = t;
249 cand_score = score;
250 }
1da177e4 251 }
e1a80002 252
eb8ce741 253 for (t = ign->tunnels_wc[h1]; t; t = t->next) {
749c10f9
TT
254 if (t->parms.i_key != key ||
255 !(t->dev->flags & IFF_UP))
256 continue;
257
258 if (t->dev->type != ARPHRD_IPGRE &&
259 t->dev->type != dev_type)
260 continue;
261
afcf1242 262 score = 0;
749c10f9 263 if (t->parms.link != link)
afcf1242 264 score |= 1;
749c10f9 265 if (t->dev->type != dev_type)
afcf1242
TT
266 score |= 2;
267 if (score == 0)
749c10f9 268 return t;
afcf1242
TT
269
270 if (score < cand_score) {
271 cand = t;
272 cand_score = score;
273 }
1da177e4
LT
274 }
275
afcf1242
TT
276 if (cand != NULL)
277 return cand;
e1a80002 278
749c10f9 279 if (ign->fb_tunnel_dev->flags & IFF_UP)
7daa0004 280 return netdev_priv(ign->fb_tunnel_dev);
749c10f9 281
1da177e4
LT
282 return NULL;
283}
284
f57e7d5a
PE
285static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
286 struct ip_tunnel_parm *parms)
1da177e4 287{
5056a1ef
YH
288 __be32 remote = parms->iph.daddr;
289 __be32 local = parms->iph.saddr;
290 __be32 key = parms->i_key;
1da177e4
LT
291 unsigned h = HASH(key);
292 int prio = 0;
293
294 if (local)
295 prio |= 1;
f97c1e0c 296 if (remote && !ipv4_is_multicast(remote)) {
1da177e4
LT
297 prio |= 2;
298 h ^= HASH(remote);
299 }
300
eb8ce741 301 return &ign->tunnels[prio][h];
1da177e4
LT
302}
303
f57e7d5a
PE
304static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
305 struct ip_tunnel *t)
5056a1ef 306{
f57e7d5a 307 return __ipgre_bucket(ign, &t->parms);
5056a1ef
YH
308}
309
f57e7d5a 310static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
1da177e4 311{
f57e7d5a 312 struct ip_tunnel **tp = ipgre_bucket(ign, t);
1da177e4
LT
313
314 t->next = *tp;
315 write_lock_bh(&ipgre_lock);
316 *tp = t;
317 write_unlock_bh(&ipgre_lock);
318}
319
f57e7d5a 320static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
1da177e4
LT
321{
322 struct ip_tunnel **tp;
323
f57e7d5a 324 for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
1da177e4
LT
325 if (t == *tp) {
326 write_lock_bh(&ipgre_lock);
327 *tp = t->next;
328 write_unlock_bh(&ipgre_lock);
329 break;
330 }
331 }
332}
333
e1a80002
HX
334static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
335 struct ip_tunnel_parm *parms,
336 int type)
1da177e4 337{
d5a0a1e3
AV
338 __be32 remote = parms->iph.daddr;
339 __be32 local = parms->iph.saddr;
340 __be32 key = parms->i_key;
749c10f9 341 int link = parms->link;
e1a80002
HX
342 struct ip_tunnel *t, **tp;
343 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
344
345 for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
346 if (local == t->parms.iph.saddr &&
347 remote == t->parms.iph.daddr &&
348 key == t->parms.i_key &&
749c10f9 349 link == t->parms.link &&
e1a80002
HX
350 type == t->dev->type)
351 break;
352
353 return t;
354}
355
356static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
357 struct ip_tunnel_parm *parms, int create)
358{
359 struct ip_tunnel *t, *nt;
1da177e4 360 struct net_device *dev;
1da177e4 361 char name[IFNAMSIZ];
f57e7d5a 362 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1da177e4 363
e1a80002
HX
364 t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
365 if (t || !create)
366 return t;
1da177e4
LT
367
368 if (parms->name[0])
369 strlcpy(name, parms->name, IFNAMSIZ);
34cc7ba6
PE
370 else
371 sprintf(name, "gre%%d");
1da177e4
LT
372
373 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
374 if (!dev)
375 return NULL;
376
0b67eceb
PE
377 dev_net_set(dev, net);
378
b37d428b
PE
379 if (strchr(name, '%')) {
380 if (dev_alloc_name(dev, name) < 0)
381 goto failed_free;
382 }
383
2941a486 384 nt = netdev_priv(dev);
1da177e4 385 nt->parms = *parms;
c19e654d 386 dev->rtnl_link_ops = &ipgre_link_ops;
1da177e4 387
42aa9162
HX
388 dev->mtu = ipgre_tunnel_bind_dev(dev);
389
b37d428b
PE
390 if (register_netdevice(dev) < 0)
391 goto failed_free;
1da177e4 392
1da177e4 393 dev_hold(dev);
f57e7d5a 394 ipgre_tunnel_link(ign, nt);
1da177e4
LT
395 return nt;
396
b37d428b
PE
397failed_free:
398 free_netdev(dev);
1da177e4
LT
399 return NULL;
400}
401
402static void ipgre_tunnel_uninit(struct net_device *dev)
403{
f57e7d5a
PE
404 struct net *net = dev_net(dev);
405 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
406
407 ipgre_tunnel_unlink(ign, netdev_priv(dev));
1da177e4
LT
408 dev_put(dev);
409}
410
411
412static void ipgre_err(struct sk_buff *skb, u32 info)
413{
1da177e4 414
071f92d0 415/* All the routers (except for Linux) return only
1da177e4
LT
416 8 bytes of packet payload. It means, that precise relaying of
417 ICMP in the real Internet is absolutely infeasible.
418
419 Moreover, Cisco "wise men" put GRE key to the third word
420 in GRE header. It makes impossible maintaining even soft state for keyed
421 GRE tunnels with enabled checksum. Tell them "thank you".
422
423 Well, I wonder, rfc1812 was written by Cisco employee,
424 what the hell these idiots break standrads established
425 by themself???
426 */
427
6ed2533e 428 struct iphdr *iph = (struct iphdr *)skb->data;
d5a0a1e3 429 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
1da177e4 430 int grehlen = (iph->ihl<<2) + 4;
88c7664f
ACM
431 const int type = icmp_hdr(skb)->type;
432 const int code = icmp_hdr(skb)->code;
1da177e4 433 struct ip_tunnel *t;
d5a0a1e3 434 __be16 flags;
1da177e4
LT
435
436 flags = p[0];
437 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
438 if (flags&(GRE_VERSION|GRE_ROUTING))
439 return;
440 if (flags&GRE_KEY) {
441 grehlen += 4;
442 if (flags&GRE_CSUM)
443 grehlen += 4;
444 }
445 }
446
447 /* If only 8 bytes returned, keyed message will be dropped here */
448 if (skb_headlen(skb) < grehlen)
449 return;
450
451 switch (type) {
452 default:
453 case ICMP_PARAMETERPROB:
454 return;
455
456 case ICMP_DEST_UNREACH:
457 switch (code) {
458 case ICMP_SR_FAILED:
459 case ICMP_PORT_UNREACH:
460 /* Impossible event. */
461 return;
462 case ICMP_FRAG_NEEDED:
463 /* Soft state for pmtu is maintained by IP core. */
464 return;
465 default:
466 /* All others are translated to HOST_UNREACH.
467 rfc2003 contains "deep thoughts" about NET_UNREACH,
468 I believe they are just ether pollution. --ANK
469 */
470 break;
471 }
472 break;
473 case ICMP_TIME_EXCEEDED:
474 if (code != ICMP_EXC_TTL)
475 return;
476 break;
477 }
478
479 read_lock(&ipgre_lock);
749c10f9 480 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
e1a80002
HX
481 flags & GRE_KEY ?
482 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
483 p[1]);
f97c1e0c
JP
484 if (t == NULL || t->parms.iph.daddr == 0 ||
485 ipv4_is_multicast(t->parms.iph.daddr))
1da177e4
LT
486 goto out;
487
488 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
489 goto out;
490
da6185d8 491 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
1da177e4
LT
492 t->err_count++;
493 else
494 t->err_count = 1;
495 t->err_time = jiffies;
496out:
497 read_unlock(&ipgre_lock);
498 return;
1da177e4
LT
499}
500
501static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
502{
503 if (INET_ECN_is_ce(iph->tos)) {
504 if (skb->protocol == htons(ETH_P_IP)) {
eddc9ec5 505 IP_ECN_set_ce(ip_hdr(skb));
1da177e4 506 } else if (skb->protocol == htons(ETH_P_IPV6)) {
0660e03f 507 IP6_ECN_set_ce(ipv6_hdr(skb));
1da177e4
LT
508 }
509 }
510}
511
512static inline u8
513ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
514{
515 u8 inner = 0;
516 if (skb->protocol == htons(ETH_P_IP))
517 inner = old_iph->tos;
518 else if (skb->protocol == htons(ETH_P_IPV6))
519 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
520 return INET_ECN_encapsulate(tos, inner);
521}
522
523static int ipgre_rcv(struct sk_buff *skb)
524{
525 struct iphdr *iph;
526 u8 *h;
d5a0a1e3 527 __be16 flags;
d3bc23e7 528 __sum16 csum = 0;
d5a0a1e3 529 __be32 key = 0;
1da177e4
LT
530 u32 seqno = 0;
531 struct ip_tunnel *tunnel;
532 int offset = 4;
e1a80002 533 __be16 gre_proto;
64194c31 534 unsigned int len;
1da177e4
LT
535
536 if (!pskb_may_pull(skb, 16))
537 goto drop_nolock;
538
eddc9ec5 539 iph = ip_hdr(skb);
1da177e4 540 h = skb->data;
d5a0a1e3 541 flags = *(__be16*)h;
1da177e4
LT
542
543 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
544 /* - Version must be 0.
545 - We do not support routing headers.
546 */
547 if (flags&(GRE_VERSION|GRE_ROUTING))
548 goto drop_nolock;
549
550 if (flags&GRE_CSUM) {
fb286bb2 551 switch (skb->ip_summed) {
84fa7933 552 case CHECKSUM_COMPLETE:
d3bc23e7 553 csum = csum_fold(skb->csum);
fb286bb2
HX
554 if (!csum)
555 break;
556 /* fall through */
557 case CHECKSUM_NONE:
558 skb->csum = 0;
559 csum = __skb_checksum_complete(skb);
84fa7933 560 skb->ip_summed = CHECKSUM_COMPLETE;
1da177e4
LT
561 }
562 offset += 4;
563 }
564 if (flags&GRE_KEY) {
d5a0a1e3 565 key = *(__be32*)(h + offset);
1da177e4
LT
566 offset += 4;
567 }
568 if (flags&GRE_SEQ) {
d5a0a1e3 569 seqno = ntohl(*(__be32*)(h + offset));
1da177e4
LT
570 offset += 4;
571 }
572 }
573
e1a80002
HX
574 gre_proto = *(__be16 *)(h + 2);
575
1da177e4 576 read_lock(&ipgre_lock);
749c10f9 577 if ((tunnel = ipgre_tunnel_lookup(skb->dev,
e1a80002
HX
578 iph->saddr, iph->daddr, key,
579 gre_proto))) {
addd68eb
PE
580 struct net_device_stats *stats = &tunnel->dev->stats;
581
1da177e4
LT
582 secpath_reset(skb);
583
e1a80002 584 skb->protocol = gre_proto;
1da177e4
LT
585 /* WCCP version 1 and 2 protocol decoding.
586 * - Change protocol to IP
587 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
588 */
e1a80002 589 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
496c98df 590 skb->protocol = htons(ETH_P_IP);
e905a9ed 591 if ((*(h + offset) & 0xF0) != 0x40)
1da177e4
LT
592 offset += 4;
593 }
594
1d069167 595 skb->mac_header = skb->network_header;
4209fb60 596 __pskb_pull(skb, offset);
9c70220b 597 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
1da177e4
LT
598 skb->pkt_type = PACKET_HOST;
599#ifdef CONFIG_NET_IPGRE_BROADCAST
f97c1e0c 600 if (ipv4_is_multicast(iph->daddr)) {
1da177e4 601 /* Looped back packet, drop it! */
511c3f92 602 if (skb_rtable(skb)->fl.iif == 0)
1da177e4 603 goto drop;
addd68eb 604 stats->multicast++;
1da177e4
LT
605 skb->pkt_type = PACKET_BROADCAST;
606 }
607#endif
608
609 if (((flags&GRE_CSUM) && csum) ||
610 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
addd68eb
PE
611 stats->rx_crc_errors++;
612 stats->rx_errors++;
1da177e4
LT
613 goto drop;
614 }
615 if (tunnel->parms.i_flags&GRE_SEQ) {
616 if (!(flags&GRE_SEQ) ||
617 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
addd68eb
PE
618 stats->rx_fifo_errors++;
619 stats->rx_errors++;
1da177e4
LT
620 goto drop;
621 }
622 tunnel->i_seqno = seqno + 1;
623 }
e1a80002 624
64194c31
HX
625 len = skb->len;
626
e1a80002
HX
627 /* Warning: All skb pointers will be invalidated! */
628 if (tunnel->dev->type == ARPHRD_ETHER) {
629 if (!pskb_may_pull(skb, ETH_HLEN)) {
630 stats->rx_length_errors++;
631 stats->rx_errors++;
632 goto drop;
633 }
634
635 iph = ip_hdr(skb);
636 skb->protocol = eth_type_trans(skb, tunnel->dev);
637 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
638 }
639
addd68eb 640 stats->rx_packets++;
64194c31 641 stats->rx_bytes += len;
1da177e4 642 skb->dev = tunnel->dev;
adf30907 643 skb_dst_drop(skb);
1da177e4 644 nf_reset(skb);
e1a80002
HX
645
646 skb_reset_network_header(skb);
1da177e4 647 ipgre_ecn_decapsulate(iph, skb);
e1a80002 648
1da177e4
LT
649 netif_rx(skb);
650 read_unlock(&ipgre_lock);
651 return(0);
652 }
45af08be 653 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
1da177e4
LT
654
655drop:
656 read_unlock(&ipgre_lock);
657drop_nolock:
658 kfree_skb(skb);
659 return(0);
660}
661
6fef4c0c 662static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
1da177e4 663{
2941a486 664 struct ip_tunnel *tunnel = netdev_priv(dev);
0bfbedb1
ED
665 struct net_device_stats *stats = &dev->stats;
666 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
eddc9ec5 667 struct iphdr *old_iph = ip_hdr(skb);
1da177e4
LT
668 struct iphdr *tiph;
669 u8 tos;
d5a0a1e3 670 __be16 df;
1da177e4
LT
671 struct rtable *rt; /* Route to the other host */
672 struct net_device *tdev; /* Device to other host */
673 struct iphdr *iph; /* Our new IP header */
c2636b4d 674 unsigned int max_headroom; /* The extra header space needed */
1da177e4 675 int gre_hlen;
d5a0a1e3 676 __be32 dst;
1da177e4
LT
677 int mtu;
678
e1a80002
HX
679 if (dev->type == ARPHRD_ETHER)
680 IPCB(skb)->flags = 0;
681
682 if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
1da177e4 683 gre_hlen = 0;
6ed2533e 684 tiph = (struct iphdr *)skb->data;
1da177e4
LT
685 } else {
686 gre_hlen = tunnel->hlen;
687 tiph = &tunnel->parms.iph;
688 }
689
690 if ((dst = tiph->daddr) == 0) {
691 /* NBMA tunnel */
692
adf30907 693 if (skb_dst(skb) == NULL) {
addd68eb 694 stats->tx_fifo_errors++;
1da177e4
LT
695 goto tx_error;
696 }
697
698 if (skb->protocol == htons(ETH_P_IP)) {
511c3f92 699 rt = skb_rtable(skb);
1da177e4
LT
700 if ((dst = rt->rt_gateway) == 0)
701 goto tx_error_icmp;
702 }
703#ifdef CONFIG_IPV6
704 else if (skb->protocol == htons(ETH_P_IPV6)) {
705 struct in6_addr *addr6;
706 int addr_type;
adf30907 707 struct neighbour *neigh = skb_dst(skb)->neighbour;
1da177e4
LT
708
709 if (neigh == NULL)
710 goto tx_error;
711
6ed2533e 712 addr6 = (struct in6_addr *)&neigh->primary_key;
1da177e4
LT
713 addr_type = ipv6_addr_type(addr6);
714
715 if (addr_type == IPV6_ADDR_ANY) {
0660e03f 716 addr6 = &ipv6_hdr(skb)->daddr;
1da177e4
LT
717 addr_type = ipv6_addr_type(addr6);
718 }
719
720 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
721 goto tx_error_icmp;
722
723 dst = addr6->s6_addr32[3];
724 }
725#endif
726 else
727 goto tx_error;
728 }
729
730 tos = tiph->tos;
ee686ca9
AJ
731 if (tos == 1) {
732 tos = 0;
1da177e4
LT
733 if (skb->protocol == htons(ETH_P_IP))
734 tos = old_iph->tos;
1da177e4
LT
735 }
736
737 {
738 struct flowi fl = { .oif = tunnel->parms.link,
739 .nl_u = { .ip4_u =
740 { .daddr = dst,
741 .saddr = tiph->saddr,
742 .tos = RT_TOS(tos) } },
743 .proto = IPPROTO_GRE };
96635522 744 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
addd68eb 745 stats->tx_carrier_errors++;
1da177e4
LT
746 goto tx_error;
747 }
748 }
749 tdev = rt->u.dst.dev;
750
751 if (tdev == dev) {
752 ip_rt_put(rt);
addd68eb 753 stats->collisions++;
1da177e4
LT
754 goto tx_error;
755 }
756
757 df = tiph->frag_off;
758 if (df)
c95b819a 759 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
1da177e4 760 else
adf30907 761 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
1da177e4 762
adf30907
ED
763 if (skb_dst(skb))
764 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
1da177e4
LT
765
766 if (skb->protocol == htons(ETH_P_IP)) {
767 df |= (old_iph->frag_off&htons(IP_DF));
768
769 if ((old_iph->frag_off&htons(IP_DF)) &&
770 mtu < ntohs(old_iph->tot_len)) {
771 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
772 ip_rt_put(rt);
773 goto tx_error;
774 }
775 }
776#ifdef CONFIG_IPV6
777 else if (skb->protocol == htons(ETH_P_IPV6)) {
adf30907 778 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
1da177e4 779
adf30907 780 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
f97c1e0c
JP
781 if ((tunnel->parms.iph.daddr &&
782 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
1da177e4
LT
783 rt6->rt6i_dst.plen == 128) {
784 rt6->rt6i_flags |= RTF_MODIFIED;
adf30907 785 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
1da177e4
LT
786 }
787 }
788
789 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
790 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
791 ip_rt_put(rt);
792 goto tx_error;
793 }
794 }
795#endif
796
797 if (tunnel->err_count > 0) {
da6185d8
WY
798 if (time_before(jiffies,
799 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
1da177e4
LT
800 tunnel->err_count--;
801
802 dst_link_failure(skb);
803 } else
804 tunnel->err_count = 0;
805 }
806
807 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
808
cfbba49d
PM
809 if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
810 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
1da177e4
LT
811 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
812 if (!new_skb) {
813 ip_rt_put(rt);
0bfbedb1 814 txq->tx_dropped++;
1da177e4 815 dev_kfree_skb(skb);
6ed10654 816 return NETDEV_TX_OK;
1da177e4
LT
817 }
818 if (skb->sk)
819 skb_set_owner_w(new_skb, skb->sk);
820 dev_kfree_skb(skb);
821 skb = new_skb;
eddc9ec5 822 old_iph = ip_hdr(skb);
1da177e4
LT
823 }
824
64194c31 825 skb_reset_transport_header(skb);
e2d1bca7
ACM
826 skb_push(skb, gre_hlen);
827 skb_reset_network_header(skb);
1da177e4 828 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
48d5cad8
PM
829 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
830 IPSKB_REROUTED);
adf30907
ED
831 skb_dst_drop(skb);
832 skb_dst_set(skb, &rt->u.dst);
1da177e4
LT
833
834 /*
835 * Push down and install the IPIP header.
836 */
837
eddc9ec5 838 iph = ip_hdr(skb);
1da177e4
LT
839 iph->version = 4;
840 iph->ihl = sizeof(struct iphdr) >> 2;
841 iph->frag_off = df;
842 iph->protocol = IPPROTO_GRE;
843 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
844 iph->daddr = rt->rt_dst;
845 iph->saddr = rt->rt_src;
846
847 if ((iph->ttl = tiph->ttl) == 0) {
848 if (skb->protocol == htons(ETH_P_IP))
849 iph->ttl = old_iph->ttl;
850#ifdef CONFIG_IPV6
851 else if (skb->protocol == htons(ETH_P_IPV6))
6ed2533e 852 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
1da177e4
LT
853#endif
854 else
855 iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
856 }
857
e1a80002
HX
858 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
859 ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
860 htons(ETH_P_TEB) : skb->protocol;
1da177e4
LT
861
862 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
d5a0a1e3 863 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
1da177e4
LT
864
865 if (tunnel->parms.o_flags&GRE_SEQ) {
866 ++tunnel->o_seqno;
867 *ptr = htonl(tunnel->o_seqno);
868 ptr--;
869 }
870 if (tunnel->parms.o_flags&GRE_KEY) {
871 *ptr = tunnel->parms.o_key;
872 ptr--;
873 }
874 if (tunnel->parms.o_flags&GRE_CSUM) {
875 *ptr = 0;
5f92a738 876 *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
1da177e4
LT
877 }
878 }
879
880 nf_reset(skb);
881
882 IPTUNNEL_XMIT();
6ed10654 883 return NETDEV_TX_OK;
1da177e4
LT
884
885tx_error_icmp:
886 dst_link_failure(skb);
887
888tx_error:
889 stats->tx_errors++;
890 dev_kfree_skb(skb);
6ed10654 891 return NETDEV_TX_OK;
1da177e4
LT
892}
893
42aa9162 894static int ipgre_tunnel_bind_dev(struct net_device *dev)
ee34c1eb
MS
895{
896 struct net_device *tdev = NULL;
897 struct ip_tunnel *tunnel;
898 struct iphdr *iph;
899 int hlen = LL_MAX_HEADER;
900 int mtu = ETH_DATA_LEN;
901 int addend = sizeof(struct iphdr) + 4;
902
903 tunnel = netdev_priv(dev);
904 iph = &tunnel->parms.iph;
905
c95b819a 906 /* Guess output device to choose reasonable mtu and needed_headroom */
ee34c1eb
MS
907
908 if (iph->daddr) {
909 struct flowi fl = { .oif = tunnel->parms.link,
910 .nl_u = { .ip4_u =
911 { .daddr = iph->daddr,
912 .saddr = iph->saddr,
913 .tos = RT_TOS(iph->tos) } },
914 .proto = IPPROTO_GRE };
915 struct rtable *rt;
96635522 916 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
ee34c1eb
MS
917 tdev = rt->u.dst.dev;
918 ip_rt_put(rt);
919 }
e1a80002
HX
920
921 if (dev->type != ARPHRD_ETHER)
922 dev->flags |= IFF_POINTOPOINT;
ee34c1eb
MS
923 }
924
925 if (!tdev && tunnel->parms.link)
96635522 926 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
ee34c1eb
MS
927
928 if (tdev) {
c95b819a 929 hlen = tdev->hard_header_len + tdev->needed_headroom;
ee34c1eb
MS
930 mtu = tdev->mtu;
931 }
932 dev->iflink = tunnel->parms.link;
933
934 /* Precalculate GRE options length */
935 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
936 if (tunnel->parms.o_flags&GRE_CSUM)
937 addend += 4;
938 if (tunnel->parms.o_flags&GRE_KEY)
939 addend += 4;
940 if (tunnel->parms.o_flags&GRE_SEQ)
941 addend += 4;
942 }
c95b819a 943 dev->needed_headroom = addend + hlen;
8cdb0456 944 mtu -= dev->hard_header_len + addend;
42aa9162
HX
945
946 if (mtu < 68)
947 mtu = 68;
948
ee34c1eb
MS
949 tunnel->hlen = addend;
950
42aa9162 951 return mtu;
ee34c1eb
MS
952}
953
1da177e4
LT
954static int
955ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
956{
957 int err = 0;
958 struct ip_tunnel_parm p;
959 struct ip_tunnel *t;
f57e7d5a
PE
960 struct net *net = dev_net(dev);
961 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1da177e4
LT
962
963 switch (cmd) {
964 case SIOCGETTUNNEL:
965 t = NULL;
7daa0004 966 if (dev == ign->fb_tunnel_dev) {
1da177e4
LT
967 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
968 err = -EFAULT;
969 break;
970 }
f57e7d5a 971 t = ipgre_tunnel_locate(net, &p, 0);
1da177e4
LT
972 }
973 if (t == NULL)
2941a486 974 t = netdev_priv(dev);
1da177e4
LT
975 memcpy(&p, &t->parms, sizeof(p));
976 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
977 err = -EFAULT;
978 break;
979
980 case SIOCADDTUNNEL:
981 case SIOCCHGTUNNEL:
982 err = -EPERM;
983 if (!capable(CAP_NET_ADMIN))
984 goto done;
985
986 err = -EFAULT;
987 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
988 goto done;
989
990 err = -EINVAL;
991 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
992 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
993 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
994 goto done;
995 if (p.iph.ttl)
996 p.iph.frag_off |= htons(IP_DF);
997
998 if (!(p.i_flags&GRE_KEY))
999 p.i_key = 0;
1000 if (!(p.o_flags&GRE_KEY))
1001 p.o_key = 0;
1002
f57e7d5a 1003 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1da177e4 1004
7daa0004 1005 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1da177e4
LT
1006 if (t != NULL) {
1007 if (t->dev != dev) {
1008 err = -EEXIST;
1009 break;
1010 }
1011 } else {
6ed2533e 1012 unsigned nflags = 0;
1da177e4 1013
2941a486 1014 t = netdev_priv(dev);
1da177e4 1015
f97c1e0c 1016 if (ipv4_is_multicast(p.iph.daddr))
1da177e4
LT
1017 nflags = IFF_BROADCAST;
1018 else if (p.iph.daddr)
1019 nflags = IFF_POINTOPOINT;
1020
1021 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1022 err = -EINVAL;
1023 break;
1024 }
f57e7d5a 1025 ipgre_tunnel_unlink(ign, t);
1da177e4
LT
1026 t->parms.iph.saddr = p.iph.saddr;
1027 t->parms.iph.daddr = p.iph.daddr;
1028 t->parms.i_key = p.i_key;
1029 t->parms.o_key = p.o_key;
1030 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1031 memcpy(dev->broadcast, &p.iph.daddr, 4);
f57e7d5a 1032 ipgre_tunnel_link(ign, t);
1da177e4
LT
1033 netdev_state_change(dev);
1034 }
1035 }
1036
1037 if (t) {
1038 err = 0;
1039 if (cmd == SIOCCHGTUNNEL) {
1040 t->parms.iph.ttl = p.iph.ttl;
1041 t->parms.iph.tos = p.iph.tos;
1042 t->parms.iph.frag_off = p.iph.frag_off;
ee34c1eb
MS
1043 if (t->parms.link != p.link) {
1044 t->parms.link = p.link;
42aa9162 1045 dev->mtu = ipgre_tunnel_bind_dev(dev);
ee34c1eb
MS
1046 netdev_state_change(dev);
1047 }
1da177e4
LT
1048 }
1049 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1050 err = -EFAULT;
1051 } else
1052 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1053 break;
1054
1055 case SIOCDELTUNNEL:
1056 err = -EPERM;
1057 if (!capable(CAP_NET_ADMIN))
1058 goto done;
1059
7daa0004 1060 if (dev == ign->fb_tunnel_dev) {
1da177e4
LT
1061 err = -EFAULT;
1062 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1063 goto done;
1064 err = -ENOENT;
f57e7d5a 1065 if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1da177e4
LT
1066 goto done;
1067 err = -EPERM;
7daa0004 1068 if (t == netdev_priv(ign->fb_tunnel_dev))
1da177e4
LT
1069 goto done;
1070 dev = t->dev;
1071 }
22f8cde5
SH
1072 unregister_netdevice(dev);
1073 err = 0;
1da177e4
LT
1074 break;
1075
1076 default:
1077 err = -EINVAL;
1078 }
1079
1080done:
1081 return err;
1082}
1083
1da177e4
LT
1084static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1085{
2941a486 1086 struct ip_tunnel *tunnel = netdev_priv(dev);
c95b819a
HX
1087 if (new_mtu < 68 ||
1088 new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1da177e4
LT
1089 return -EINVAL;
1090 dev->mtu = new_mtu;
1091 return 0;
1092}
1093
1da177e4
LT
1094/* Nice toy. Unfortunately, useless in real life :-)
1095 It allows to construct virtual multiprotocol broadcast "LAN"
1096 over the Internet, provided multicast routing is tuned.
1097
1098
1099 I have no idea was this bicycle invented before me,
1100 so that I had to set ARPHRD_IPGRE to a random value.
1101 I have an impression, that Cisco could make something similar,
1102 but this feature is apparently missing in IOS<=11.2(8).
e905a9ed 1103
1da177e4
LT
1104 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1105 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1106
1107 ping -t 255 224.66.66.66
1108
1109 If nobody answers, mbone does not work.
1110
1111 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1112 ip addr add 10.66.66.<somewhat>/24 dev Universe
1113 ifconfig Universe up
1114 ifconfig Universe add fe80::<Your_real_addr>/10
1115 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1116 ftp 10.66.66.66
1117 ...
1118 ftp fec0:6666:6666::193.233.7.65
1119 ...
1120
1121 */
1122
3b04ddde
SH
1123static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1124 unsigned short type,
1125 const void *daddr, const void *saddr, unsigned len)
1da177e4 1126{
2941a486 1127 struct ip_tunnel *t = netdev_priv(dev);
1da177e4 1128 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
d5a0a1e3 1129 __be16 *p = (__be16*)(iph+1);
1da177e4
LT
1130
1131 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1132 p[0] = t->parms.o_flags;
1133 p[1] = htons(type);
1134
1135 /*
e905a9ed 1136 * Set the source hardware address.
1da177e4 1137 */
e905a9ed 1138
1da177e4
LT
1139 if (saddr)
1140 memcpy(&iph->saddr, saddr, 4);
1141
1142 if (daddr) {
1143 memcpy(&iph->daddr, daddr, 4);
1144 return t->hlen;
1145 }
f97c1e0c 1146 if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1da177e4 1147 return t->hlen;
e905a9ed 1148
1da177e4
LT
1149 return -t->hlen;
1150}
1151
6a5f44d7
TT
1152static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1153{
6ed2533e 1154 struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
6a5f44d7
TT
1155 memcpy(haddr, &iph->saddr, 4);
1156 return 4;
1157}
1158
3b04ddde
SH
1159static const struct header_ops ipgre_header_ops = {
1160 .create = ipgre_header,
6a5f44d7 1161 .parse = ipgre_header_parse,
3b04ddde
SH
1162};
1163
6a5f44d7 1164#ifdef CONFIG_NET_IPGRE_BROADCAST
1da177e4
LT
1165static int ipgre_open(struct net_device *dev)
1166{
2941a486 1167 struct ip_tunnel *t = netdev_priv(dev);
1da177e4 1168
f97c1e0c 1169 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1da177e4
LT
1170 struct flowi fl = { .oif = t->parms.link,
1171 .nl_u = { .ip4_u =
1172 { .daddr = t->parms.iph.daddr,
1173 .saddr = t->parms.iph.saddr,
1174 .tos = RT_TOS(t->parms.iph.tos) } },
1175 .proto = IPPROTO_GRE };
1176 struct rtable *rt;
96635522 1177 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1da177e4
LT
1178 return -EADDRNOTAVAIL;
1179 dev = rt->u.dst.dev;
1180 ip_rt_put(rt);
e5ed6399 1181 if (__in_dev_get_rtnl(dev) == NULL)
1da177e4
LT
1182 return -EADDRNOTAVAIL;
1183 t->mlink = dev->ifindex;
e5ed6399 1184 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1da177e4
LT
1185 }
1186 return 0;
1187}
1188
1189static int ipgre_close(struct net_device *dev)
1190{
2941a486 1191 struct ip_tunnel *t = netdev_priv(dev);
b8c26a33 1192
f97c1e0c 1193 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
7fee0ca2 1194 struct in_device *in_dev;
c346dca1 1195 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1da177e4
LT
1196 if (in_dev) {
1197 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1198 in_dev_put(in_dev);
1199 }
1200 }
1201 return 0;
1202}
1203
1204#endif
1205
b8c26a33
SH
1206static const struct net_device_ops ipgre_netdev_ops = {
1207 .ndo_init = ipgre_tunnel_init,
1208 .ndo_uninit = ipgre_tunnel_uninit,
1209#ifdef CONFIG_NET_IPGRE_BROADCAST
1210 .ndo_open = ipgre_open,
1211 .ndo_stop = ipgre_close,
1212#endif
1213 .ndo_start_xmit = ipgre_tunnel_xmit,
1214 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1215 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1216};
1217
1da177e4
LT
1218static void ipgre_tunnel_setup(struct net_device *dev)
1219{
b8c26a33 1220 dev->netdev_ops = &ipgre_netdev_ops;
1da177e4 1221 dev->destructor = free_netdev;
1da177e4
LT
1222
1223 dev->type = ARPHRD_IPGRE;
c95b819a 1224 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
46f25dff 1225 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1da177e4
LT
1226 dev->flags = IFF_NOARP;
1227 dev->iflink = 0;
1228 dev->addr_len = 4;
0b67eceb 1229 dev->features |= NETIF_F_NETNS_LOCAL;
108bfa89 1230 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
1da177e4
LT
1231}
1232
1233static int ipgre_tunnel_init(struct net_device *dev)
1234{
1da177e4
LT
1235 struct ip_tunnel *tunnel;
1236 struct iphdr *iph;
1da177e4 1237
2941a486 1238 tunnel = netdev_priv(dev);
1da177e4
LT
1239 iph = &tunnel->parms.iph;
1240
1241 tunnel->dev = dev;
1242 strcpy(tunnel->parms.name, dev->name);
1243
1244 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1245 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1246
1da177e4 1247 if (iph->daddr) {
1da177e4 1248#ifdef CONFIG_NET_IPGRE_BROADCAST
f97c1e0c 1249 if (ipv4_is_multicast(iph->daddr)) {
1da177e4
LT
1250 if (!iph->saddr)
1251 return -EINVAL;
1252 dev->flags = IFF_BROADCAST;
3b04ddde 1253 dev->header_ops = &ipgre_header_ops;
1da177e4
LT
1254 }
1255#endif
ee34c1eb 1256 } else
6a5f44d7 1257 dev->header_ops = &ipgre_header_ops;
1da177e4 1258
1da177e4
LT
1259 return 0;
1260}
1261
b8c26a33 1262static void ipgre_fb_tunnel_init(struct net_device *dev)
1da177e4 1263{
2941a486 1264 struct ip_tunnel *tunnel = netdev_priv(dev);
1da177e4 1265 struct iphdr *iph = &tunnel->parms.iph;
eb8ce741 1266 struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1da177e4
LT
1267
1268 tunnel->dev = dev;
1269 strcpy(tunnel->parms.name, dev->name);
1270
1271 iph->version = 4;
1272 iph->protocol = IPPROTO_GRE;
1273 iph->ihl = 5;
1274 tunnel->hlen = sizeof(struct iphdr) + 4;
1275
1276 dev_hold(dev);
eb8ce741 1277 ign->tunnels_wc[0] = tunnel;
1da177e4
LT
1278}
1279
1280
32613090 1281static const struct net_protocol ipgre_protocol = {
1da177e4
LT
1282 .handler = ipgre_rcv,
1283 .err_handler = ipgre_err,
f96c148f 1284 .netns_ok = 1,
1da177e4
LT
1285};
1286
eb8ce741
PE
1287static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1288{
1289 int prio;
1290
1291 for (prio = 0; prio < 4; prio++) {
1292 int h;
1293 for (h = 0; h < HASH_SIZE; h++) {
1294 struct ip_tunnel *t;
1295 while ((t = ign->tunnels[prio][h]) != NULL)
1296 unregister_netdevice(t->dev);
1297 }
1298 }
1299}
1300
59a4c759
PE
1301static int ipgre_init_net(struct net *net)
1302{
1303 int err;
1304 struct ipgre_net *ign;
1305
1306 err = -ENOMEM;
eb8ce741 1307 ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
59a4c759
PE
1308 if (ign == NULL)
1309 goto err_alloc;
1310
1311 err = net_assign_generic(net, ipgre_net_id, ign);
1312 if (err < 0)
1313 goto err_assign;
1314
7daa0004
PE
1315 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1316 ipgre_tunnel_setup);
1317 if (!ign->fb_tunnel_dev) {
1318 err = -ENOMEM;
1319 goto err_alloc_dev;
1320 }
be77e593 1321 dev_net_set(ign->fb_tunnel_dev, net);
7daa0004 1322
b8c26a33 1323 ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
c19e654d 1324 ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
7daa0004
PE
1325
1326 if ((err = register_netdev(ign->fb_tunnel_dev)))
1327 goto err_reg_dev;
1328
59a4c759
PE
1329 return 0;
1330
7daa0004
PE
1331err_reg_dev:
1332 free_netdev(ign->fb_tunnel_dev);
1333err_alloc_dev:
1334 /* nothing */
59a4c759
PE
1335err_assign:
1336 kfree(ign);
1337err_alloc:
1338 return err;
1339}
1340
1341static void ipgre_exit_net(struct net *net)
1342{
1343 struct ipgre_net *ign;
1344
1345 ign = net_generic(net, ipgre_net_id);
7daa0004 1346 rtnl_lock();
eb8ce741 1347 ipgre_destroy_tunnels(ign);
7daa0004 1348 rtnl_unlock();
59a4c759
PE
1349 kfree(ign);
1350}
1351
1352static struct pernet_operations ipgre_net_ops = {
1353 .init = ipgre_init_net,
1354 .exit = ipgre_exit_net,
1355};
1da177e4 1356
c19e654d
HX
1357static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1358{
1359 __be16 flags;
1360
1361 if (!data)
1362 return 0;
1363
1364 flags = 0;
1365 if (data[IFLA_GRE_IFLAGS])
1366 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1367 if (data[IFLA_GRE_OFLAGS])
1368 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1369 if (flags & (GRE_VERSION|GRE_ROUTING))
1370 return -EINVAL;
1371
1372 return 0;
1373}
1374
e1a80002
HX
1375static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1376{
1377 __be32 daddr;
1378
1379 if (tb[IFLA_ADDRESS]) {
1380 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1381 return -EINVAL;
1382 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1383 return -EADDRNOTAVAIL;
1384 }
1385
1386 if (!data)
1387 goto out;
1388
1389 if (data[IFLA_GRE_REMOTE]) {
1390 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1391 if (!daddr)
1392 return -EINVAL;
1393 }
1394
1395out:
1396 return ipgre_tunnel_validate(tb, data);
1397}
1398
c19e654d
HX
1399static void ipgre_netlink_parms(struct nlattr *data[],
1400 struct ip_tunnel_parm *parms)
1401{
7bb82d92 1402 memset(parms, 0, sizeof(*parms));
c19e654d
HX
1403
1404 parms->iph.protocol = IPPROTO_GRE;
1405
1406 if (!data)
1407 return;
1408
1409 if (data[IFLA_GRE_LINK])
1410 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1411
1412 if (data[IFLA_GRE_IFLAGS])
1413 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1414
1415 if (data[IFLA_GRE_OFLAGS])
1416 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1417
1418 if (data[IFLA_GRE_IKEY])
1419 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1420
1421 if (data[IFLA_GRE_OKEY])
1422 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1423
1424 if (data[IFLA_GRE_LOCAL])
4d74f8ba 1425 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
c19e654d
HX
1426
1427 if (data[IFLA_GRE_REMOTE])
4d74f8ba 1428 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
c19e654d
HX
1429
1430 if (data[IFLA_GRE_TTL])
1431 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1432
1433 if (data[IFLA_GRE_TOS])
1434 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1435
1436 if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1437 parms->iph.frag_off = htons(IP_DF);
1438}
1439
e1a80002
HX
1440static int ipgre_tap_init(struct net_device *dev)
1441{
1442 struct ip_tunnel *tunnel;
1443
1444 tunnel = netdev_priv(dev);
1445
1446 tunnel->dev = dev;
1447 strcpy(tunnel->parms.name, dev->name);
1448
1449 ipgre_tunnel_bind_dev(dev);
1450
1451 return 0;
1452}
1453
b8c26a33
SH
1454static const struct net_device_ops ipgre_tap_netdev_ops = {
1455 .ndo_init = ipgre_tap_init,
1456 .ndo_uninit = ipgre_tunnel_uninit,
1457 .ndo_start_xmit = ipgre_tunnel_xmit,
1458 .ndo_set_mac_address = eth_mac_addr,
1459 .ndo_validate_addr = eth_validate_addr,
1460 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1461};
1462
e1a80002
HX
1463static void ipgre_tap_setup(struct net_device *dev)
1464{
1465
1466 ether_setup(dev);
1467
b8c26a33 1468 dev->netdev_ops = &ipgre_netdev_ops;
e1a80002 1469 dev->destructor = free_netdev;
e1a80002
HX
1470
1471 dev->iflink = 0;
1472 dev->features |= NETIF_F_NETNS_LOCAL;
1473}
1474
c19e654d
HX
1475static int ipgre_newlink(struct net_device *dev, struct nlattr *tb[],
1476 struct nlattr *data[])
1477{
1478 struct ip_tunnel *nt;
1479 struct net *net = dev_net(dev);
1480 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1481 int mtu;
1482 int err;
1483
1484 nt = netdev_priv(dev);
1485 ipgre_netlink_parms(data, &nt->parms);
1486
e1a80002 1487 if (ipgre_tunnel_find(net, &nt->parms, dev->type))
c19e654d
HX
1488 return -EEXIST;
1489
e1a80002
HX
1490 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1491 random_ether_addr(dev->dev_addr);
1492
c19e654d
HX
1493 mtu = ipgre_tunnel_bind_dev(dev);
1494 if (!tb[IFLA_MTU])
1495 dev->mtu = mtu;
1496
1497 err = register_netdevice(dev);
1498 if (err)
1499 goto out;
1500
1501 dev_hold(dev);
1502 ipgre_tunnel_link(ign, nt);
1503
1504out:
1505 return err;
1506}
1507
1508static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1509 struct nlattr *data[])
1510{
1511 struct ip_tunnel *t, *nt;
1512 struct net *net = dev_net(dev);
1513 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1514 struct ip_tunnel_parm p;
1515 int mtu;
1516
1517 if (dev == ign->fb_tunnel_dev)
1518 return -EINVAL;
1519
1520 nt = netdev_priv(dev);
1521 ipgre_netlink_parms(data, &p);
1522
1523 t = ipgre_tunnel_locate(net, &p, 0);
1524
1525 if (t) {
1526 if (t->dev != dev)
1527 return -EEXIST;
1528 } else {
1529 unsigned nflags = 0;
1530
1531 t = nt;
1532
1533 if (ipv4_is_multicast(p.iph.daddr))
1534 nflags = IFF_BROADCAST;
1535 else if (p.iph.daddr)
1536 nflags = IFF_POINTOPOINT;
1537
1538 if ((dev->flags ^ nflags) &
1539 (IFF_POINTOPOINT | IFF_BROADCAST))
1540 return -EINVAL;
1541
1542 ipgre_tunnel_unlink(ign, t);
1543 t->parms.iph.saddr = p.iph.saddr;
1544 t->parms.iph.daddr = p.iph.daddr;
1545 t->parms.i_key = p.i_key;
1546 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1547 memcpy(dev->broadcast, &p.iph.daddr, 4);
1548 ipgre_tunnel_link(ign, t);
1549 netdev_state_change(dev);
1550 }
1551
1552 t->parms.o_key = p.o_key;
1553 t->parms.iph.ttl = p.iph.ttl;
1554 t->parms.iph.tos = p.iph.tos;
1555 t->parms.iph.frag_off = p.iph.frag_off;
1556
1557 if (t->parms.link != p.link) {
1558 t->parms.link = p.link;
1559 mtu = ipgre_tunnel_bind_dev(dev);
1560 if (!tb[IFLA_MTU])
1561 dev->mtu = mtu;
1562 netdev_state_change(dev);
1563 }
1564
1565 return 0;
1566}
1567
1568static size_t ipgre_get_size(const struct net_device *dev)
1569{
1570 return
1571 /* IFLA_GRE_LINK */
1572 nla_total_size(4) +
1573 /* IFLA_GRE_IFLAGS */
1574 nla_total_size(2) +
1575 /* IFLA_GRE_OFLAGS */
1576 nla_total_size(2) +
1577 /* IFLA_GRE_IKEY */
1578 nla_total_size(4) +
1579 /* IFLA_GRE_OKEY */
1580 nla_total_size(4) +
1581 /* IFLA_GRE_LOCAL */
1582 nla_total_size(4) +
1583 /* IFLA_GRE_REMOTE */
1584 nla_total_size(4) +
1585 /* IFLA_GRE_TTL */
1586 nla_total_size(1) +
1587 /* IFLA_GRE_TOS */
1588 nla_total_size(1) +
1589 /* IFLA_GRE_PMTUDISC */
1590 nla_total_size(1) +
1591 0;
1592}
1593
1594static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1595{
1596 struct ip_tunnel *t = netdev_priv(dev);
1597 struct ip_tunnel_parm *p = &t->parms;
1598
1599 NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1600 NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1601 NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
ba9e64b1
PM
1602 NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1603 NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
4d74f8ba
PM
1604 NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1605 NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
c19e654d
HX
1606 NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1607 NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1608 NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1609
1610 return 0;
1611
1612nla_put_failure:
1613 return -EMSGSIZE;
1614}
1615
1616static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1617 [IFLA_GRE_LINK] = { .type = NLA_U32 },
1618 [IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
1619 [IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
1620 [IFLA_GRE_IKEY] = { .type = NLA_U32 },
1621 [IFLA_GRE_OKEY] = { .type = NLA_U32 },
4d74f8ba
PM
1622 [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1623 [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
c19e654d
HX
1624 [IFLA_GRE_TTL] = { .type = NLA_U8 },
1625 [IFLA_GRE_TOS] = { .type = NLA_U8 },
1626 [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
1627};
1628
1629static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1630 .kind = "gre",
1631 .maxtype = IFLA_GRE_MAX,
1632 .policy = ipgre_policy,
1633 .priv_size = sizeof(struct ip_tunnel),
1634 .setup = ipgre_tunnel_setup,
1635 .validate = ipgre_tunnel_validate,
1636 .newlink = ipgre_newlink,
1637 .changelink = ipgre_changelink,
1638 .get_size = ipgre_get_size,
1639 .fill_info = ipgre_fill_info,
1640};
1641
e1a80002
HX
1642static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1643 .kind = "gretap",
1644 .maxtype = IFLA_GRE_MAX,
1645 .policy = ipgre_policy,
1646 .priv_size = sizeof(struct ip_tunnel),
1647 .setup = ipgre_tap_setup,
1648 .validate = ipgre_tap_validate,
1649 .newlink = ipgre_newlink,
1650 .changelink = ipgre_changelink,
1651 .get_size = ipgre_get_size,
1652 .fill_info = ipgre_fill_info,
1653};
1654
1da177e4
LT
1655/*
1656 * And now the modules code and kernel interface.
1657 */
1658
1659static int __init ipgre_init(void)
1660{
1661 int err;
1662
1663 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1664
1665 if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1666 printk(KERN_INFO "ipgre init: can't add protocol\n");
1667 return -EAGAIN;
1668 }
1669
59a4c759
PE
1670 err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1671 if (err < 0)
c19e654d 1672 goto gen_device_failed;
7daa0004 1673
c19e654d
HX
1674 err = rtnl_link_register(&ipgre_link_ops);
1675 if (err < 0)
1676 goto rtnl_link_failed;
1677
e1a80002
HX
1678 err = rtnl_link_register(&ipgre_tap_ops);
1679 if (err < 0)
1680 goto tap_ops_failed;
1681
c19e654d 1682out:
1da177e4 1683 return err;
c19e654d 1684
e1a80002
HX
1685tap_ops_failed:
1686 rtnl_link_unregister(&ipgre_link_ops);
c19e654d
HX
1687rtnl_link_failed:
1688 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1689gen_device_failed:
1690 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1691 goto out;
1da177e4
LT
1692}
1693
db44575f 1694static void __exit ipgre_fini(void)
1da177e4 1695{
e1a80002 1696 rtnl_link_unregister(&ipgre_tap_ops);
c19e654d
HX
1697 rtnl_link_unregister(&ipgre_link_ops);
1698 unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1da177e4
LT
1699 if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1700 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1da177e4
LT
1701}
1702
1703module_init(ipgre_init);
1704module_exit(ipgre_fini);
1705MODULE_LICENSE("GPL");
4d74f8ba
PM
1706MODULE_ALIAS_RTNL_LINK("gre");
1707MODULE_ALIAS_RTNL_LINK("gretap");