]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/ipv4/ip_gre.c
tunnels: prepare percpu accounting
[net-next-2.6.git] / net / ipv4 / ip_gre.c
CommitLineData
1da177e4 1/*
e905a9ed 2 * Linux NET3: GRE over IP protocol decoder.
1da177e4
LT
3 *
4 * Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 */
12
4fc268d2 13#include <linux/capability.h>
1da177e4
LT
14#include <linux/module.h>
15#include <linux/types.h>
1da177e4 16#include <linux/kernel.h>
5a0e3ad6 17#include <linux/slab.h>
1da177e4
LT
18#include <asm/uaccess.h>
19#include <linux/skbuff.h>
20#include <linux/netdevice.h>
21#include <linux/in.h>
22#include <linux/tcp.h>
23#include <linux/udp.h>
24#include <linux/if_arp.h>
25#include <linux/mroute.h>
26#include <linux/init.h>
27#include <linux/in6.h>
28#include <linux/inetdevice.h>
29#include <linux/igmp.h>
30#include <linux/netfilter_ipv4.h>
e1a80002 31#include <linux/etherdevice.h>
46f25dff 32#include <linux/if_ether.h>
1da177e4
LT
33
34#include <net/sock.h>
35#include <net/ip.h>
36#include <net/icmp.h>
37#include <net/protocol.h>
38#include <net/ipip.h>
39#include <net/arp.h>
40#include <net/checksum.h>
41#include <net/dsfield.h>
42#include <net/inet_ecn.h>
43#include <net/xfrm.h>
59a4c759
PE
44#include <net/net_namespace.h>
45#include <net/netns/generic.h>
c19e654d 46#include <net/rtnetlink.h>
00959ade 47#include <net/gre.h>
1da177e4 48
842c74bf 49#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1da177e4
LT
50#include <net/ipv6.h>
51#include <net/ip6_fib.h>
52#include <net/ip6_route.h>
53#endif
54
55/*
56 Problems & solutions
57 --------------------
58
59 1. The most important issue is detecting local dead loops.
60 They would cause complete host lockup in transmit, which
61 would be "resolved" by stack overflow or, if queueing is enabled,
62 with infinite looping in net_bh.
63
64 We cannot track such dead loops during route installation,
65 it is infeasible task. The most general solutions would be
66 to keep skb->encapsulation counter (sort of local ttl),
67 and silently drop packet when it expires. It is the best
68 solution, but it supposes maintaining new variable in ALL
69 skb, even if no tunneling is used.
70
a43912ab 71 Current solution: HARD_TX_LOCK lock breaks dead loops.
1da177e4
LT
72
73
74
75 2. Networking dead loops would not kill routers, but would really
76 kill network. IP hop limit plays role of "t->recursion" in this case,
77 if we copy it from packet being encapsulated to upper header.
78 It is very good solution, but it introduces two problems:
79
80 - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
81 do not work over tunnels.
82 - traceroute does not work. I planned to relay ICMP from tunnel,
83 so that this problem would be solved and traceroute output
84 would be even more informative. This idea appeared to be wrong:
85 only Linux complies to rfc1812 now (yes, guys, Linux is the only
86 true router now :-)), all routers (at least, in neighbourhood of mine)
87 return only 8 bytes of payload. It is the end.
88
89 Hence, if we want that OSPF worked or traceroute said something reasonable,
90 we should search for another solution.
91
92 One of them is to parse packet trying to detect inner encapsulation
93 made by our node. It is difficult or even impossible, especially,
94 taking into account fragmentation. To be short, it is not a solution at all.
95
96 Current solution: The solution was UNEXPECTEDLY SIMPLE.
97 We force DF flag on tunnels with preconfigured hop limit,
98 that is ALL. :-) Well, it does not remove the problem completely,
99 but exponential growth of network traffic is changed to linear
100 (branches, that exceed pmtu are pruned) and tunnel mtu
101 fastly degrades to value <68, where looping stops.
102 Yes, it is not good if there exists a router in the loop,
103 which does not force DF, even when encapsulating packets have DF set.
104 But it is not our problem! Nobody could accuse us, we made
105 all that we could make. Even if it is your gated who injected
106 fatal route to network, even if it were you who configured
107 fatal static route: you are innocent. :-)
108
109
110
111 3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
112 practically identical code. It would be good to glue them
113 together, but it is not very evident, how to make them modular.
114 sit is integral part of IPv6, ipip and gre are naturally modular.
115 We could extract common parts (hash table, ioctl etc)
116 to a separate module (ip_tunnel.c).
117
118 Alexey Kuznetsov.
119 */
120
c19e654d 121static struct rtnl_link_ops ipgre_link_ops __read_mostly;
1da177e4
LT
122static int ipgre_tunnel_init(struct net_device *dev);
123static void ipgre_tunnel_setup(struct net_device *dev);
42aa9162 124static int ipgre_tunnel_bind_dev(struct net_device *dev);
1da177e4
LT
125
126/* Fallback tunnel: no source, no destination, no key, no options */
127
eb8ce741
PE
128#define HASH_SIZE 16
129
f99189b1 130static int ipgre_net_id __read_mostly;
59a4c759 131struct ipgre_net {
1507850b 132 struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];
eb8ce741 133
7daa0004 134 struct net_device *fb_tunnel_dev;
59a4c759
PE
135};
136
1da177e4
LT
137/* Tunnel hash table */
138
139/*
140 4 hash tables:
141
142 3: (remote,local)
143 2: (remote,*)
144 1: (*,local)
145 0: (*,*)
146
147 We require exact key match i.e. if a key is present in packet
148 it will match only tunnel with the same key; if it is not present,
149 it will match only keyless tunnel.
150
151 All keyless packets, if not matched by configured keyless tunnels
152 will match fallback tunnel.
153 */
154
/*
 * Fold a 32-bit address or key into a 4-bit bucket index (0..15).
 *
 * The argument is fully parenthesised so that an expression argument such
 * as HASH(a ^ b) hashes the whole value; note it is still evaluated twice,
 * so it must not have side effects.
 */
#define HASH(addr) ((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & 0xF)
1da177e4 156
eb8ce741
PE
157#define tunnels_r_l tunnels[3]
158#define tunnels_r tunnels[2]
159#define tunnels_l tunnels[1]
160#define tunnels_wc tunnels[0]
8d5b2c08 161/*
1507850b 162 * Locking : hash tables are protected by RCU and RTNL
8d5b2c08 163 */
1da177e4 164
8d5b2c08
ED
165#define for_each_ip_tunnel_rcu(start) \
166 for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
1da177e4
LT
167
168/* Given src, dst and key, find appropriate for input tunnel. */
169
749c10f9 170static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
e1a80002
HX
171 __be32 remote, __be32 local,
172 __be32 key, __be16 gre_proto)
1da177e4 173{
749c10f9
TT
174 struct net *net = dev_net(dev);
175 int link = dev->ifindex;
1507850b
ED
176 unsigned int h0 = HASH(remote);
177 unsigned int h1 = HASH(key);
afcf1242 178 struct ip_tunnel *t, *cand = NULL;
7daa0004 179 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
e1a80002
HX
180 int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
181 ARPHRD_ETHER : ARPHRD_IPGRE;
afcf1242 182 int score, cand_score = 4;
1da177e4 183
8d5b2c08 184 for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
749c10f9
TT
185 if (local != t->parms.iph.saddr ||
186 remote != t->parms.iph.daddr ||
187 key != t->parms.i_key ||
188 !(t->dev->flags & IFF_UP))
189 continue;
190
191 if (t->dev->type != ARPHRD_IPGRE &&
192 t->dev->type != dev_type)
193 continue;
194
afcf1242 195 score = 0;
749c10f9 196 if (t->parms.link != link)
afcf1242 197 score |= 1;
749c10f9 198 if (t->dev->type != dev_type)
afcf1242
TT
199 score |= 2;
200 if (score == 0)
749c10f9 201 return t;
afcf1242
TT
202
203 if (score < cand_score) {
204 cand = t;
205 cand_score = score;
206 }
1da177e4 207 }
e1a80002 208
8d5b2c08 209 for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
749c10f9
TT
210 if (remote != t->parms.iph.daddr ||
211 key != t->parms.i_key ||
212 !(t->dev->flags & IFF_UP))
213 continue;
214
215 if (t->dev->type != ARPHRD_IPGRE &&
216 t->dev->type != dev_type)
217 continue;
218
afcf1242 219 score = 0;
749c10f9 220 if (t->parms.link != link)
afcf1242 221 score |= 1;
749c10f9 222 if (t->dev->type != dev_type)
afcf1242
TT
223 score |= 2;
224 if (score == 0)
749c10f9 225 return t;
afcf1242
TT
226
227 if (score < cand_score) {
228 cand = t;
229 cand_score = score;
230 }
1da177e4 231 }
e1a80002 232
8d5b2c08 233 for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
749c10f9
TT
234 if ((local != t->parms.iph.saddr &&
235 (local != t->parms.iph.daddr ||
236 !ipv4_is_multicast(local))) ||
237 key != t->parms.i_key ||
238 !(t->dev->flags & IFF_UP))
239 continue;
240
241 if (t->dev->type != ARPHRD_IPGRE &&
242 t->dev->type != dev_type)
243 continue;
244
afcf1242 245 score = 0;
749c10f9 246 if (t->parms.link != link)
afcf1242 247 score |= 1;
749c10f9 248 if (t->dev->type != dev_type)
afcf1242
TT
249 score |= 2;
250 if (score == 0)
749c10f9 251 return t;
afcf1242
TT
252
253 if (score < cand_score) {
254 cand = t;
255 cand_score = score;
256 }
1da177e4 257 }
e1a80002 258
8d5b2c08 259 for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
749c10f9
TT
260 if (t->parms.i_key != key ||
261 !(t->dev->flags & IFF_UP))
262 continue;
263
264 if (t->dev->type != ARPHRD_IPGRE &&
265 t->dev->type != dev_type)
266 continue;
267
afcf1242 268 score = 0;
749c10f9 269 if (t->parms.link != link)
afcf1242 270 score |= 1;
749c10f9 271 if (t->dev->type != dev_type)
afcf1242
TT
272 score |= 2;
273 if (score == 0)
749c10f9 274 return t;
afcf1242
TT
275
276 if (score < cand_score) {
277 cand = t;
278 cand_score = score;
279 }
1da177e4
LT
280 }
281
afcf1242
TT
282 if (cand != NULL)
283 return cand;
e1a80002 284
8d5b2c08
ED
285 dev = ign->fb_tunnel_dev;
286 if (dev->flags & IFF_UP)
287 return netdev_priv(dev);
749c10f9 288
1da177e4
LT
289 return NULL;
290}
291
1507850b 292static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
f57e7d5a 293 struct ip_tunnel_parm *parms)
1da177e4 294{
5056a1ef
YH
295 __be32 remote = parms->iph.daddr;
296 __be32 local = parms->iph.saddr;
297 __be32 key = parms->i_key;
1507850b 298 unsigned int h = HASH(key);
1da177e4
LT
299 int prio = 0;
300
301 if (local)
302 prio |= 1;
f97c1e0c 303 if (remote && !ipv4_is_multicast(remote)) {
1da177e4
LT
304 prio |= 2;
305 h ^= HASH(remote);
306 }
307
eb8ce741 308 return &ign->tunnels[prio][h];
1da177e4
LT
309}
310
1507850b 311static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
f57e7d5a 312 struct ip_tunnel *t)
5056a1ef 313{
f57e7d5a 314 return __ipgre_bucket(ign, &t->parms);
5056a1ef
YH
315}
316
f57e7d5a 317static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
1da177e4 318{
1507850b 319 struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);
1da177e4 320
1507850b 321 rcu_assign_pointer(t->next, rtnl_dereference(*tp));
8d5b2c08 322 rcu_assign_pointer(*tp, t);
1da177e4
LT
323}
324
f57e7d5a 325static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
1da177e4 326{
1507850b
ED
327 struct ip_tunnel __rcu **tp;
328 struct ip_tunnel *iter;
329
330 for (tp = ipgre_bucket(ign, t);
331 (iter = rtnl_dereference(*tp)) != NULL;
332 tp = &iter->next) {
333 if (t == iter) {
334 rcu_assign_pointer(*tp, t->next);
1da177e4
LT
335 break;
336 }
337 }
338}
339
e1a80002
HX
340static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
341 struct ip_tunnel_parm *parms,
342 int type)
1da177e4 343{
d5a0a1e3
AV
344 __be32 remote = parms->iph.daddr;
345 __be32 local = parms->iph.saddr;
346 __be32 key = parms->i_key;
749c10f9 347 int link = parms->link;
1507850b
ED
348 struct ip_tunnel *t;
349 struct ip_tunnel __rcu **tp;
e1a80002
HX
350 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
351
1507850b
ED
352 for (tp = __ipgre_bucket(ign, parms);
353 (t = rtnl_dereference(*tp)) != NULL;
354 tp = &t->next)
e1a80002
HX
355 if (local == t->parms.iph.saddr &&
356 remote == t->parms.iph.daddr &&
357 key == t->parms.i_key &&
749c10f9 358 link == t->parms.link &&
e1a80002
HX
359 type == t->dev->type)
360 break;
361
362 return t;
363}
364
1507850b 365static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
e1a80002
HX
366 struct ip_tunnel_parm *parms, int create)
367{
368 struct ip_tunnel *t, *nt;
1da177e4 369 struct net_device *dev;
1da177e4 370 char name[IFNAMSIZ];
f57e7d5a 371 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1da177e4 372
e1a80002
HX
373 t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
374 if (t || !create)
375 return t;
1da177e4
LT
376
377 if (parms->name[0])
378 strlcpy(name, parms->name, IFNAMSIZ);
34cc7ba6
PE
379 else
380 sprintf(name, "gre%%d");
1da177e4
LT
381
382 dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
383 if (!dev)
384 return NULL;
385
0b67eceb
PE
386 dev_net_set(dev, net);
387
b37d428b
PE
388 if (strchr(name, '%')) {
389 if (dev_alloc_name(dev, name) < 0)
390 goto failed_free;
391 }
392
2941a486 393 nt = netdev_priv(dev);
1da177e4 394 nt->parms = *parms;
c19e654d 395 dev->rtnl_link_ops = &ipgre_link_ops;
1da177e4 396
42aa9162
HX
397 dev->mtu = ipgre_tunnel_bind_dev(dev);
398
b37d428b
PE
399 if (register_netdevice(dev) < 0)
400 goto failed_free;
1da177e4 401
1da177e4 402 dev_hold(dev);
f57e7d5a 403 ipgre_tunnel_link(ign, nt);
1da177e4
LT
404 return nt;
405
b37d428b
PE
406failed_free:
407 free_netdev(dev);
1da177e4
LT
408 return NULL;
409}
410
411static void ipgre_tunnel_uninit(struct net_device *dev)
412{
f57e7d5a
PE
413 struct net *net = dev_net(dev);
414 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
415
416 ipgre_tunnel_unlink(ign, netdev_priv(dev));
1da177e4
LT
417 dev_put(dev);
418}
419
420
421static void ipgre_err(struct sk_buff *skb, u32 info)
422{
1da177e4 423
071f92d0 424/* All the routers (except for Linux) return only
1da177e4
LT
425 8 bytes of packet payload. It means, that precise relaying of
426 ICMP in the real Internet is absolutely infeasible.
427
428 Moreover, Cisco "wise men" put GRE key to the third word
429 in GRE header. It makes impossible maintaining even soft state for keyed
430 GRE tunnels with enabled checksum. Tell them "thank you".
431
432 Well, I wonder, rfc1812 was written by Cisco employee,
433 what the hell these idiots break standrads established
434 by themself???
435 */
436
6ed2533e 437 struct iphdr *iph = (struct iphdr *)skb->data;
d5a0a1e3 438 __be16 *p = (__be16*)(skb->data+(iph->ihl<<2));
1da177e4 439 int grehlen = (iph->ihl<<2) + 4;
88c7664f
ACM
440 const int type = icmp_hdr(skb)->type;
441 const int code = icmp_hdr(skb)->code;
1da177e4 442 struct ip_tunnel *t;
d5a0a1e3 443 __be16 flags;
1da177e4
LT
444
445 flags = p[0];
446 if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
447 if (flags&(GRE_VERSION|GRE_ROUTING))
448 return;
449 if (flags&GRE_KEY) {
450 grehlen += 4;
451 if (flags&GRE_CSUM)
452 grehlen += 4;
453 }
454 }
455
456 /* If only 8 bytes returned, keyed message will be dropped here */
457 if (skb_headlen(skb) < grehlen)
458 return;
459
460 switch (type) {
461 default:
462 case ICMP_PARAMETERPROB:
463 return;
464
465 case ICMP_DEST_UNREACH:
466 switch (code) {
467 case ICMP_SR_FAILED:
468 case ICMP_PORT_UNREACH:
469 /* Impossible event. */
470 return;
471 case ICMP_FRAG_NEEDED:
472 /* Soft state for pmtu is maintained by IP core. */
473 return;
474 default:
475 /* All others are translated to HOST_UNREACH.
476 rfc2003 contains "deep thoughts" about NET_UNREACH,
477 I believe they are just ether pollution. --ANK
478 */
479 break;
480 }
481 break;
482 case ICMP_TIME_EXCEEDED:
483 if (code != ICMP_EXC_TTL)
484 return;
485 break;
486 }
487
8d5b2c08 488 rcu_read_lock();
749c10f9 489 t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
e1a80002
HX
490 flags & GRE_KEY ?
491 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
492 p[1]);
f97c1e0c
JP
493 if (t == NULL || t->parms.iph.daddr == 0 ||
494 ipv4_is_multicast(t->parms.iph.daddr))
1da177e4
LT
495 goto out;
496
497 if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
498 goto out;
499
da6185d8 500 if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
1da177e4
LT
501 t->err_count++;
502 else
503 t->err_count = 1;
504 t->err_time = jiffies;
505out:
8d5b2c08 506 rcu_read_unlock();
1da177e4
LT
507}
508
509static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
510{
511 if (INET_ECN_is_ce(iph->tos)) {
512 if (skb->protocol == htons(ETH_P_IP)) {
eddc9ec5 513 IP_ECN_set_ce(ip_hdr(skb));
1da177e4 514 } else if (skb->protocol == htons(ETH_P_IPV6)) {
0660e03f 515 IP6_ECN_set_ce(ipv6_hdr(skb));
1da177e4
LT
516 }
517 }
518}
519
520static inline u8
521ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
522{
523 u8 inner = 0;
524 if (skb->protocol == htons(ETH_P_IP))
525 inner = old_iph->tos;
526 else if (skb->protocol == htons(ETH_P_IPV6))
527 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
528 return INET_ECN_encapsulate(tos, inner);
529}
530
531static int ipgre_rcv(struct sk_buff *skb)
532{
533 struct iphdr *iph;
534 u8 *h;
d5a0a1e3 535 __be16 flags;
d3bc23e7 536 __sum16 csum = 0;
d5a0a1e3 537 __be32 key = 0;
1da177e4
LT
538 u32 seqno = 0;
539 struct ip_tunnel *tunnel;
540 int offset = 4;
e1a80002 541 __be16 gre_proto;
1da177e4
LT
542
543 if (!pskb_may_pull(skb, 16))
544 goto drop_nolock;
545
eddc9ec5 546 iph = ip_hdr(skb);
1da177e4 547 h = skb->data;
d5a0a1e3 548 flags = *(__be16*)h;
1da177e4
LT
549
550 if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
551 /* - Version must be 0.
552 - We do not support routing headers.
553 */
554 if (flags&(GRE_VERSION|GRE_ROUTING))
555 goto drop_nolock;
556
557 if (flags&GRE_CSUM) {
fb286bb2 558 switch (skb->ip_summed) {
84fa7933 559 case CHECKSUM_COMPLETE:
d3bc23e7 560 csum = csum_fold(skb->csum);
fb286bb2
HX
561 if (!csum)
562 break;
563 /* fall through */
564 case CHECKSUM_NONE:
565 skb->csum = 0;
566 csum = __skb_checksum_complete(skb);
84fa7933 567 skb->ip_summed = CHECKSUM_COMPLETE;
1da177e4
LT
568 }
569 offset += 4;
570 }
571 if (flags&GRE_KEY) {
d5a0a1e3 572 key = *(__be32*)(h + offset);
1da177e4
LT
573 offset += 4;
574 }
575 if (flags&GRE_SEQ) {
d5a0a1e3 576 seqno = ntohl(*(__be32*)(h + offset));
1da177e4
LT
577 offset += 4;
578 }
579 }
580
e1a80002
HX
581 gre_proto = *(__be16 *)(h + 2);
582
8d5b2c08 583 rcu_read_lock();
749c10f9 584 if ((tunnel = ipgre_tunnel_lookup(skb->dev,
e1a80002
HX
585 iph->saddr, iph->daddr, key,
586 gre_proto))) {
addd68eb
PE
587 struct net_device_stats *stats = &tunnel->dev->stats;
588
1da177e4
LT
589 secpath_reset(skb);
590
e1a80002 591 skb->protocol = gre_proto;
1da177e4
LT
592 /* WCCP version 1 and 2 protocol decoding.
593 * - Change protocol to IP
594 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
595 */
e1a80002 596 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
496c98df 597 skb->protocol = htons(ETH_P_IP);
e905a9ed 598 if ((*(h + offset) & 0xF0) != 0x40)
1da177e4
LT
599 offset += 4;
600 }
601
1d069167 602 skb->mac_header = skb->network_header;
4209fb60 603 __pskb_pull(skb, offset);
9c70220b 604 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
1da177e4
LT
605 skb->pkt_type = PACKET_HOST;
606#ifdef CONFIG_NET_IPGRE_BROADCAST
f97c1e0c 607 if (ipv4_is_multicast(iph->daddr)) {
1da177e4 608 /* Looped back packet, drop it! */
511c3f92 609 if (skb_rtable(skb)->fl.iif == 0)
1da177e4 610 goto drop;
addd68eb 611 stats->multicast++;
1da177e4
LT
612 skb->pkt_type = PACKET_BROADCAST;
613 }
614#endif
615
616 if (((flags&GRE_CSUM) && csum) ||
617 (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
addd68eb
PE
618 stats->rx_crc_errors++;
619 stats->rx_errors++;
1da177e4
LT
620 goto drop;
621 }
622 if (tunnel->parms.i_flags&GRE_SEQ) {
623 if (!(flags&GRE_SEQ) ||
624 (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
addd68eb
PE
625 stats->rx_fifo_errors++;
626 stats->rx_errors++;
1da177e4
LT
627 goto drop;
628 }
629 tunnel->i_seqno = seqno + 1;
630 }
e1a80002
HX
631
632 /* Warning: All skb pointers will be invalidated! */
633 if (tunnel->dev->type == ARPHRD_ETHER) {
634 if (!pskb_may_pull(skb, ETH_HLEN)) {
635 stats->rx_length_errors++;
636 stats->rx_errors++;
637 goto drop;
638 }
639
640 iph = ip_hdr(skb);
641 skb->protocol = eth_type_trans(skb, tunnel->dev);
642 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
643 }
644
d19d56dd 645 skb_tunnel_rx(skb, tunnel->dev);
e1a80002
HX
646
647 skb_reset_network_header(skb);
1da177e4 648 ipgre_ecn_decapsulate(iph, skb);
e1a80002 649
8990f468
ED
650 if (netif_rx(skb) == NET_RX_DROP)
651 stats->rx_dropped++;
652
8d5b2c08 653 rcu_read_unlock();
8990f468 654 return 0;
1da177e4 655 }
45af08be 656 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
1da177e4
LT
657
658drop:
8d5b2c08 659 rcu_read_unlock();
1da177e4
LT
660drop_nolock:
661 kfree_skb(skb);
a02cec21 662 return 0;
1da177e4
LT
663}
664
6fef4c0c 665static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
1da177e4 666{
2941a486 667 struct ip_tunnel *tunnel = netdev_priv(dev);
0bfbedb1
ED
668 struct net_device_stats *stats = &dev->stats;
669 struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
eddc9ec5 670 struct iphdr *old_iph = ip_hdr(skb);
1da177e4
LT
671 struct iphdr *tiph;
672 u8 tos;
d5a0a1e3 673 __be16 df;
1da177e4 674 struct rtable *rt; /* Route to the other host */
1507850b 675 struct net_device *tdev; /* Device to other host */
1da177e4 676 struct iphdr *iph; /* Our new IP header */
c2636b4d 677 unsigned int max_headroom; /* The extra header space needed */
1da177e4 678 int gre_hlen;
d5a0a1e3 679 __be32 dst;
1da177e4
LT
680 int mtu;
681
e1a80002
HX
682 if (dev->type == ARPHRD_ETHER)
683 IPCB(skb)->flags = 0;
684
685 if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
1da177e4 686 gre_hlen = 0;
6ed2533e 687 tiph = (struct iphdr *)skb->data;
1da177e4
LT
688 } else {
689 gre_hlen = tunnel->hlen;
690 tiph = &tunnel->parms.iph;
691 }
692
693 if ((dst = tiph->daddr) == 0) {
694 /* NBMA tunnel */
695
adf30907 696 if (skb_dst(skb) == NULL) {
addd68eb 697 stats->tx_fifo_errors++;
1da177e4
LT
698 goto tx_error;
699 }
700
701 if (skb->protocol == htons(ETH_P_IP)) {
511c3f92 702 rt = skb_rtable(skb);
1da177e4
LT
703 if ((dst = rt->rt_gateway) == 0)
704 goto tx_error_icmp;
705 }
842c74bf 706#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1da177e4
LT
707 else if (skb->protocol == htons(ETH_P_IPV6)) {
708 struct in6_addr *addr6;
709 int addr_type;
adf30907 710 struct neighbour *neigh = skb_dst(skb)->neighbour;
1da177e4
LT
711
712 if (neigh == NULL)
713 goto tx_error;
714
6ed2533e 715 addr6 = (struct in6_addr *)&neigh->primary_key;
1da177e4
LT
716 addr_type = ipv6_addr_type(addr6);
717
718 if (addr_type == IPV6_ADDR_ANY) {
0660e03f 719 addr6 = &ipv6_hdr(skb)->daddr;
1da177e4
LT
720 addr_type = ipv6_addr_type(addr6);
721 }
722
723 if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
724 goto tx_error_icmp;
725
726 dst = addr6->s6_addr32[3];
727 }
728#endif
729 else
730 goto tx_error;
731 }
732
733 tos = tiph->tos;
ee686ca9
AJ
734 if (tos == 1) {
735 tos = 0;
1da177e4
LT
736 if (skb->protocol == htons(ETH_P_IP))
737 tos = old_iph->tos;
dd4ba83d
SH
738 else if (skb->protocol == htons(ETH_P_IPV6))
739 tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
1da177e4
LT
740 }
741
742 {
743 struct flowi fl = { .oif = tunnel->parms.link,
744 .nl_u = { .ip4_u =
745 { .daddr = dst,
746 .saddr = tiph->saddr,
747 .tos = RT_TOS(tos) } },
748 .proto = IPPROTO_GRE };
96635522 749 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
addd68eb 750 stats->tx_carrier_errors++;
1da177e4
LT
751 goto tx_error;
752 }
753 }
d8d1f30b 754 tdev = rt->dst.dev;
1da177e4
LT
755
756 if (tdev == dev) {
757 ip_rt_put(rt);
addd68eb 758 stats->collisions++;
1da177e4
LT
759 goto tx_error;
760 }
761
762 df = tiph->frag_off;
763 if (df)
d8d1f30b 764 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
1da177e4 765 else
adf30907 766 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
1da177e4 767
adf30907
ED
768 if (skb_dst(skb))
769 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
1da177e4
LT
770
771 if (skb->protocol == htons(ETH_P_IP)) {
772 df |= (old_iph->frag_off&htons(IP_DF));
773
774 if ((old_iph->frag_off&htons(IP_DF)) &&
775 mtu < ntohs(old_iph->tot_len)) {
776 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
777 ip_rt_put(rt);
778 goto tx_error;
779 }
780 }
842c74bf 781#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1da177e4 782 else if (skb->protocol == htons(ETH_P_IPV6)) {
adf30907 783 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
1da177e4 784
adf30907 785 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
f97c1e0c
JP
786 if ((tunnel->parms.iph.daddr &&
787 !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
1da177e4
LT
788 rt6->rt6i_dst.plen == 128) {
789 rt6->rt6i_flags |= RTF_MODIFIED;
adf30907 790 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
1da177e4
LT
791 }
792 }
793
794 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
3ffe533c 795 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1da177e4
LT
796 ip_rt_put(rt);
797 goto tx_error;
798 }
799 }
800#endif
801
802 if (tunnel->err_count > 0) {
da6185d8
WY
803 if (time_before(jiffies,
804 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
1da177e4
LT
805 tunnel->err_count--;
806
807 dst_link_failure(skb);
808 } else
809 tunnel->err_count = 0;
810 }
811
d8d1f30b 812 max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
1da177e4 813
cfbba49d
PM
814 if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
815 (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
1da177e4 816 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
243aad83
TT
817 if (max_headroom > dev->needed_headroom)
818 dev->needed_headroom = max_headroom;
1da177e4
LT
819 if (!new_skb) {
820 ip_rt_put(rt);
0bfbedb1 821 txq->tx_dropped++;
1da177e4 822 dev_kfree_skb(skb);
6ed10654 823 return NETDEV_TX_OK;
1da177e4
LT
824 }
825 if (skb->sk)
826 skb_set_owner_w(new_skb, skb->sk);
827 dev_kfree_skb(skb);
828 skb = new_skb;
eddc9ec5 829 old_iph = ip_hdr(skb);
1da177e4
LT
830 }
831
64194c31 832 skb_reset_transport_header(skb);
e2d1bca7
ACM
833 skb_push(skb, gre_hlen);
834 skb_reset_network_header(skb);
1da177e4 835 memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
48d5cad8
PM
836 IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
837 IPSKB_REROUTED);
adf30907 838 skb_dst_drop(skb);
d8d1f30b 839 skb_dst_set(skb, &rt->dst);
1da177e4
LT
840
841 /*
842 * Push down and install the IPIP header.
843 */
844
eddc9ec5 845 iph = ip_hdr(skb);
1da177e4
LT
846 iph->version = 4;
847 iph->ihl = sizeof(struct iphdr) >> 2;
848 iph->frag_off = df;
849 iph->protocol = IPPROTO_GRE;
850 iph->tos = ipgre_ecn_encapsulate(tos, old_iph, skb);
851 iph->daddr = rt->rt_dst;
852 iph->saddr = rt->rt_src;
853
854 if ((iph->ttl = tiph->ttl) == 0) {
855 if (skb->protocol == htons(ETH_P_IP))
856 iph->ttl = old_iph->ttl;
842c74bf 857#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
1da177e4 858 else if (skb->protocol == htons(ETH_P_IPV6))
6ed2533e 859 iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
1da177e4
LT
860#endif
861 else
d8d1f30b 862 iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT);
1da177e4
LT
863 }
864
e1a80002
HX
865 ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
866 ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
867 htons(ETH_P_TEB) : skb->protocol;
1da177e4
LT
868
869 if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
d5a0a1e3 870 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
1da177e4
LT
871
872 if (tunnel->parms.o_flags&GRE_SEQ) {
873 ++tunnel->o_seqno;
874 *ptr = htonl(tunnel->o_seqno);
875 ptr--;
876 }
877 if (tunnel->parms.o_flags&GRE_KEY) {
878 *ptr = tunnel->parms.o_key;
879 ptr--;
880 }
881 if (tunnel->parms.o_flags&GRE_CSUM) {
882 *ptr = 0;
5f92a738 883 *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
1da177e4
LT
884 }
885 }
886
887 nf_reset(skb);
888
889 IPTUNNEL_XMIT();
6ed10654 890 return NETDEV_TX_OK;
1da177e4
LT
891
892tx_error_icmp:
893 dst_link_failure(skb);
894
895tx_error:
896 stats->tx_errors++;
897 dev_kfree_skb(skb);
6ed10654 898 return NETDEV_TX_OK;
1da177e4
LT
899}
900
42aa9162 901static int ipgre_tunnel_bind_dev(struct net_device *dev)
ee34c1eb
MS
902{
903 struct net_device *tdev = NULL;
904 struct ip_tunnel *tunnel;
905 struct iphdr *iph;
906 int hlen = LL_MAX_HEADER;
907 int mtu = ETH_DATA_LEN;
908 int addend = sizeof(struct iphdr) + 4;
909
910 tunnel = netdev_priv(dev);
911 iph = &tunnel->parms.iph;
912
c95b819a 913 /* Guess output device to choose reasonable mtu and needed_headroom */
ee34c1eb
MS
914
915 if (iph->daddr) {
916 struct flowi fl = { .oif = tunnel->parms.link,
917 .nl_u = { .ip4_u =
918 { .daddr = iph->daddr,
919 .saddr = iph->saddr,
920 .tos = RT_TOS(iph->tos) } },
921 .proto = IPPROTO_GRE };
922 struct rtable *rt;
96635522 923 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
d8d1f30b 924 tdev = rt->dst.dev;
ee34c1eb
MS
925 ip_rt_put(rt);
926 }
e1a80002
HX
927
928 if (dev->type != ARPHRD_ETHER)
929 dev->flags |= IFF_POINTOPOINT;
ee34c1eb
MS
930 }
931
932 if (!tdev && tunnel->parms.link)
96635522 933 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
ee34c1eb
MS
934
935 if (tdev) {
c95b819a 936 hlen = tdev->hard_header_len + tdev->needed_headroom;
ee34c1eb
MS
937 mtu = tdev->mtu;
938 }
939 dev->iflink = tunnel->parms.link;
940
941 /* Precalculate GRE options length */
942 if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
943 if (tunnel->parms.o_flags&GRE_CSUM)
944 addend += 4;
945 if (tunnel->parms.o_flags&GRE_KEY)
946 addend += 4;
947 if (tunnel->parms.o_flags&GRE_SEQ)
948 addend += 4;
949 }
c95b819a 950 dev->needed_headroom = addend + hlen;
8cdb0456 951 mtu -= dev->hard_header_len + addend;
42aa9162
HX
952
953 if (mtu < 68)
954 mtu = 68;
955
ee34c1eb
MS
956 tunnel->hlen = addend;
957
42aa9162 958 return mtu;
ee34c1eb
MS
959}
960
1da177e4
LT
961static int
962ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
963{
964 int err = 0;
965 struct ip_tunnel_parm p;
966 struct ip_tunnel *t;
f57e7d5a
PE
967 struct net *net = dev_net(dev);
968 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1da177e4
LT
969
970 switch (cmd) {
971 case SIOCGETTUNNEL:
972 t = NULL;
7daa0004 973 if (dev == ign->fb_tunnel_dev) {
1da177e4
LT
974 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
975 err = -EFAULT;
976 break;
977 }
f57e7d5a 978 t = ipgre_tunnel_locate(net, &p, 0);
1da177e4
LT
979 }
980 if (t == NULL)
2941a486 981 t = netdev_priv(dev);
1da177e4
LT
982 memcpy(&p, &t->parms, sizeof(p));
983 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
984 err = -EFAULT;
985 break;
986
987 case SIOCADDTUNNEL:
988 case SIOCCHGTUNNEL:
989 err = -EPERM;
990 if (!capable(CAP_NET_ADMIN))
991 goto done;
992
993 err = -EFAULT;
994 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
995 goto done;
996
997 err = -EINVAL;
998 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
999 p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
1000 ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1001 goto done;
1002 if (p.iph.ttl)
1003 p.iph.frag_off |= htons(IP_DF);
1004
1005 if (!(p.i_flags&GRE_KEY))
1006 p.i_key = 0;
1007 if (!(p.o_flags&GRE_KEY))
1008 p.o_key = 0;
1009
f57e7d5a 1010 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1da177e4 1011
7daa0004 1012 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1da177e4
LT
1013 if (t != NULL) {
1014 if (t->dev != dev) {
1015 err = -EEXIST;
1016 break;
1017 }
1018 } else {
1507850b 1019 unsigned int nflags = 0;
1da177e4 1020
2941a486 1021 t = netdev_priv(dev);
1da177e4 1022
f97c1e0c 1023 if (ipv4_is_multicast(p.iph.daddr))
1da177e4
LT
1024 nflags = IFF_BROADCAST;
1025 else if (p.iph.daddr)
1026 nflags = IFF_POINTOPOINT;
1027
1028 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1029 err = -EINVAL;
1030 break;
1031 }
f57e7d5a 1032 ipgre_tunnel_unlink(ign, t);
1da177e4
LT
1033 t->parms.iph.saddr = p.iph.saddr;
1034 t->parms.iph.daddr = p.iph.daddr;
1035 t->parms.i_key = p.i_key;
1036 t->parms.o_key = p.o_key;
1037 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1038 memcpy(dev->broadcast, &p.iph.daddr, 4);
f57e7d5a 1039 ipgre_tunnel_link(ign, t);
1da177e4
LT
1040 netdev_state_change(dev);
1041 }
1042 }
1043
1044 if (t) {
1045 err = 0;
1046 if (cmd == SIOCCHGTUNNEL) {
1047 t->parms.iph.ttl = p.iph.ttl;
1048 t->parms.iph.tos = p.iph.tos;
1049 t->parms.iph.frag_off = p.iph.frag_off;
ee34c1eb
MS
1050 if (t->parms.link != p.link) {
1051 t->parms.link = p.link;
42aa9162 1052 dev->mtu = ipgre_tunnel_bind_dev(dev);
ee34c1eb
MS
1053 netdev_state_change(dev);
1054 }
1da177e4
LT
1055 }
1056 if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1057 err = -EFAULT;
1058 } else
1059 err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1060 break;
1061
1062 case SIOCDELTUNNEL:
1063 err = -EPERM;
1064 if (!capable(CAP_NET_ADMIN))
1065 goto done;
1066
7daa0004 1067 if (dev == ign->fb_tunnel_dev) {
1da177e4
LT
1068 err = -EFAULT;
1069 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1070 goto done;
1071 err = -ENOENT;
f57e7d5a 1072 if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1da177e4
LT
1073 goto done;
1074 err = -EPERM;
7daa0004 1075 if (t == netdev_priv(ign->fb_tunnel_dev))
1da177e4
LT
1076 goto done;
1077 dev = t->dev;
1078 }
22f8cde5
SH
1079 unregister_netdevice(dev);
1080 err = 0;
1da177e4
LT
1081 break;
1082
1083 default:
1084 err = -EINVAL;
1085 }
1086
1087done:
1088 return err;
1089}
1090
1da177e4
LT
1091static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1092{
2941a486 1093 struct ip_tunnel *tunnel = netdev_priv(dev);
c95b819a
HX
1094 if (new_mtu < 68 ||
1095 new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1da177e4
LT
1096 return -EINVAL;
1097 dev->mtu = new_mtu;
1098 return 0;
1099}
1100
1da177e4
LT
/* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).
e905a9ed 1110
1da177e4
LT
1111 I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1112 with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1113
1114 ping -t 255 224.66.66.66
1115
1116 If nobody answers, mbone does not work.
1117
1118 ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1119 ip addr add 10.66.66.<somewhat>/24 dev Universe
1120 ifconfig Universe up
1121 ifconfig Universe add fe80::<Your_real_addr>/10
1122 ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1123 ftp 10.66.66.66
1124 ...
1125 ftp fec0:6666:6666::193.233.7.65
1126 ...
1127
1128 */
1129
3b04ddde
SH
1130static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1131 unsigned short type,
1507850b 1132 const void *daddr, const void *saddr, unsigned int len)
1da177e4 1133{
2941a486 1134 struct ip_tunnel *t = netdev_priv(dev);
1da177e4 1135 struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
d5a0a1e3 1136 __be16 *p = (__be16*)(iph+1);
1da177e4
LT
1137
1138 memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1139 p[0] = t->parms.o_flags;
1140 p[1] = htons(type);
1141
1142 /*
e905a9ed 1143 * Set the source hardware address.
1da177e4 1144 */
e905a9ed 1145
1da177e4
LT
1146 if (saddr)
1147 memcpy(&iph->saddr, saddr, 4);
6d55cb91 1148 if (daddr)
1da177e4 1149 memcpy(&iph->daddr, daddr, 4);
6d55cb91 1150 if (iph->daddr)
1da177e4 1151 return t->hlen;
e905a9ed 1152
1da177e4
LT
1153 return -t->hlen;
1154}
1155
6a5f44d7
TT
1156static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1157{
6ed2533e 1158 struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
6a5f44d7
TT
1159 memcpy(haddr, &iph->saddr, 4);
1160 return 4;
1161}
1162
3b04ddde
SH
1163static const struct header_ops ipgre_header_ops = {
1164 .create = ipgre_header,
6a5f44d7 1165 .parse = ipgre_header_parse,
3b04ddde
SH
1166};
1167
6a5f44d7 1168#ifdef CONFIG_NET_IPGRE_BROADCAST
1da177e4
LT
1169static int ipgre_open(struct net_device *dev)
1170{
2941a486 1171 struct ip_tunnel *t = netdev_priv(dev);
1da177e4 1172
f97c1e0c 1173 if (ipv4_is_multicast(t->parms.iph.daddr)) {
1da177e4
LT
1174 struct flowi fl = { .oif = t->parms.link,
1175 .nl_u = { .ip4_u =
1176 { .daddr = t->parms.iph.daddr,
1177 .saddr = t->parms.iph.saddr,
1178 .tos = RT_TOS(t->parms.iph.tos) } },
1179 .proto = IPPROTO_GRE };
1180 struct rtable *rt;
96635522 1181 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1da177e4 1182 return -EADDRNOTAVAIL;
d8d1f30b 1183 dev = rt->dst.dev;
1da177e4 1184 ip_rt_put(rt);
e5ed6399 1185 if (__in_dev_get_rtnl(dev) == NULL)
1da177e4
LT
1186 return -EADDRNOTAVAIL;
1187 t->mlink = dev->ifindex;
e5ed6399 1188 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1da177e4
LT
1189 }
1190 return 0;
1191}
1192
1193static int ipgre_close(struct net_device *dev)
1194{
2941a486 1195 struct ip_tunnel *t = netdev_priv(dev);
b8c26a33 1196
f97c1e0c 1197 if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
7fee0ca2 1198 struct in_device *in_dev;
c346dca1 1199 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1da177e4
LT
1200 if (in_dev) {
1201 ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1202 in_dev_put(in_dev);
1203 }
1204 }
1205 return 0;
1206}
1207
1208#endif
1209
b8c26a33
SH
1210static const struct net_device_ops ipgre_netdev_ops = {
1211 .ndo_init = ipgre_tunnel_init,
1212 .ndo_uninit = ipgre_tunnel_uninit,
1213#ifdef CONFIG_NET_IPGRE_BROADCAST
1214 .ndo_open = ipgre_open,
1215 .ndo_stop = ipgre_close,
1216#endif
1217 .ndo_start_xmit = ipgre_tunnel_xmit,
1218 .ndo_do_ioctl = ipgre_tunnel_ioctl,
1219 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1220};
1221
1da177e4
LT
1222static void ipgre_tunnel_setup(struct net_device *dev)
1223{
b8c26a33 1224 dev->netdev_ops = &ipgre_netdev_ops;
1da177e4 1225 dev->destructor = free_netdev;
1da177e4
LT
1226
1227 dev->type = ARPHRD_IPGRE;
c95b819a 1228 dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
46f25dff 1229 dev->mtu = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1da177e4
LT
1230 dev->flags = IFF_NOARP;
1231 dev->iflink = 0;
1232 dev->addr_len = 4;
0b67eceb 1233 dev->features |= NETIF_F_NETNS_LOCAL;
108bfa89 1234 dev->priv_flags &= ~IFF_XMIT_DST_RELEASE;
1da177e4
LT
1235}
1236
1237static int ipgre_tunnel_init(struct net_device *dev)
1238{
1da177e4
LT
1239 struct ip_tunnel *tunnel;
1240 struct iphdr *iph;
1da177e4 1241
2941a486 1242 tunnel = netdev_priv(dev);
1da177e4
LT
1243 iph = &tunnel->parms.iph;
1244
1245 tunnel->dev = dev;
1246 strcpy(tunnel->parms.name, dev->name);
1247
1248 memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1249 memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1250
1da177e4 1251 if (iph->daddr) {
1da177e4 1252#ifdef CONFIG_NET_IPGRE_BROADCAST
f97c1e0c 1253 if (ipv4_is_multicast(iph->daddr)) {
1da177e4
LT
1254 if (!iph->saddr)
1255 return -EINVAL;
1256 dev->flags = IFF_BROADCAST;
3b04ddde 1257 dev->header_ops = &ipgre_header_ops;
1da177e4
LT
1258 }
1259#endif
ee34c1eb 1260 } else
6a5f44d7 1261 dev->header_ops = &ipgre_header_ops;
1da177e4 1262
1da177e4
LT
1263 return 0;
1264}
1265
b8c26a33 1266static void ipgre_fb_tunnel_init(struct net_device *dev)
1da177e4 1267{
2941a486 1268 struct ip_tunnel *tunnel = netdev_priv(dev);
1da177e4 1269 struct iphdr *iph = &tunnel->parms.iph;
eb8ce741 1270 struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1da177e4
LT
1271
1272 tunnel->dev = dev;
1273 strcpy(tunnel->parms.name, dev->name);
1274
1275 iph->version = 4;
1276 iph->protocol = IPPROTO_GRE;
1277 iph->ihl = 5;
1278 tunnel->hlen = sizeof(struct iphdr) + 4;
1279
1280 dev_hold(dev);
1507850b 1281 rcu_assign_pointer(ign->tunnels_wc[0], tunnel);
1da177e4
LT
1282}
1283
1284
00959ade
DK
1285static const struct gre_protocol ipgre_protocol = {
1286 .handler = ipgre_rcv,
1287 .err_handler = ipgre_err,
1da177e4
LT
1288};
1289
eef6dd65 1290static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
eb8ce741
PE
1291{
1292 int prio;
1293
1294 for (prio = 0; prio < 4; prio++) {
1295 int h;
1296 for (h = 0; h < HASH_SIZE; h++) {
1507850b
ED
1297 struct ip_tunnel *t;
1298
1299 t = rtnl_dereference(ign->tunnels[prio][h]);
eef6dd65
ED
1300
1301 while (t != NULL) {
1302 unregister_netdevice_queue(t->dev, head);
1507850b 1303 t = rtnl_dereference(t->next);
eef6dd65 1304 }
eb8ce741
PE
1305 }
1306 }
1307}
1308
2c8c1e72 1309static int __net_init ipgre_init_net(struct net *net)
59a4c759 1310{
cfb8fbf2 1311 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
59a4c759 1312 int err;
59a4c759 1313
7daa0004
PE
1314 ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1315 ipgre_tunnel_setup);
1316 if (!ign->fb_tunnel_dev) {
1317 err = -ENOMEM;
1318 goto err_alloc_dev;
1319 }
be77e593 1320 dev_net_set(ign->fb_tunnel_dev, net);
7daa0004 1321
b8c26a33 1322 ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
c19e654d 1323 ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
7daa0004
PE
1324
1325 if ((err = register_netdev(ign->fb_tunnel_dev)))
1326 goto err_reg_dev;
1327
59a4c759
PE
1328 return 0;
1329
7daa0004
PE
1330err_reg_dev:
1331 free_netdev(ign->fb_tunnel_dev);
1332err_alloc_dev:
59a4c759
PE
1333 return err;
1334}
1335
2c8c1e72 1336static void __net_exit ipgre_exit_net(struct net *net)
59a4c759
PE
1337{
1338 struct ipgre_net *ign;
eef6dd65 1339 LIST_HEAD(list);
59a4c759
PE
1340
1341 ign = net_generic(net, ipgre_net_id);
7daa0004 1342 rtnl_lock();
eef6dd65
ED
1343 ipgre_destroy_tunnels(ign, &list);
1344 unregister_netdevice_many(&list);
7daa0004 1345 rtnl_unlock();
59a4c759
PE
1346}
1347
1348static struct pernet_operations ipgre_net_ops = {
1349 .init = ipgre_init_net,
1350 .exit = ipgre_exit_net,
cfb8fbf2
EB
1351 .id = &ipgre_net_id,
1352 .size = sizeof(struct ipgre_net),
59a4c759 1353};
1da177e4 1354
c19e654d
HX
1355static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1356{
1357 __be16 flags;
1358
1359 if (!data)
1360 return 0;
1361
1362 flags = 0;
1363 if (data[IFLA_GRE_IFLAGS])
1364 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1365 if (data[IFLA_GRE_OFLAGS])
1366 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1367 if (flags & (GRE_VERSION|GRE_ROUTING))
1368 return -EINVAL;
1369
1370 return 0;
1371}
1372
e1a80002
HX
1373static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1374{
1375 __be32 daddr;
1376
1377 if (tb[IFLA_ADDRESS]) {
1378 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1379 return -EINVAL;
1380 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1381 return -EADDRNOTAVAIL;
1382 }
1383
1384 if (!data)
1385 goto out;
1386
1387 if (data[IFLA_GRE_REMOTE]) {
1388 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1389 if (!daddr)
1390 return -EINVAL;
1391 }
1392
1393out:
1394 return ipgre_tunnel_validate(tb, data);
1395}
1396
c19e654d
HX
1397static void ipgre_netlink_parms(struct nlattr *data[],
1398 struct ip_tunnel_parm *parms)
1399{
7bb82d92 1400 memset(parms, 0, sizeof(*parms));
c19e654d
HX
1401
1402 parms->iph.protocol = IPPROTO_GRE;
1403
1404 if (!data)
1405 return;
1406
1407 if (data[IFLA_GRE_LINK])
1408 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1409
1410 if (data[IFLA_GRE_IFLAGS])
1411 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1412
1413 if (data[IFLA_GRE_OFLAGS])
1414 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1415
1416 if (data[IFLA_GRE_IKEY])
1417 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1418
1419 if (data[IFLA_GRE_OKEY])
1420 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1421
1422 if (data[IFLA_GRE_LOCAL])
4d74f8ba 1423 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
c19e654d
HX
1424
1425 if (data[IFLA_GRE_REMOTE])
4d74f8ba 1426 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
c19e654d
HX
1427
1428 if (data[IFLA_GRE_TTL])
1429 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1430
1431 if (data[IFLA_GRE_TOS])
1432 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1433
1434 if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1435 parms->iph.frag_off = htons(IP_DF);
1436}
1437
e1a80002
HX
1438static int ipgre_tap_init(struct net_device *dev)
1439{
1440 struct ip_tunnel *tunnel;
1441
1442 tunnel = netdev_priv(dev);
1443
1444 tunnel->dev = dev;
1445 strcpy(tunnel->parms.name, dev->name);
1446
1447 ipgre_tunnel_bind_dev(dev);
1448
1449 return 0;
1450}
1451
b8c26a33
SH
1452static const struct net_device_ops ipgre_tap_netdev_ops = {
1453 .ndo_init = ipgre_tap_init,
1454 .ndo_uninit = ipgre_tunnel_uninit,
1455 .ndo_start_xmit = ipgre_tunnel_xmit,
1456 .ndo_set_mac_address = eth_mac_addr,
1457 .ndo_validate_addr = eth_validate_addr,
1458 .ndo_change_mtu = ipgre_tunnel_change_mtu,
1459};
1460
e1a80002
HX
1461static void ipgre_tap_setup(struct net_device *dev)
1462{
1463
1464 ether_setup(dev);
1465
2e9526b3 1466 dev->netdev_ops = &ipgre_tap_netdev_ops;
e1a80002 1467 dev->destructor = free_netdev;
e1a80002
HX
1468
1469 dev->iflink = 0;
1470 dev->features |= NETIF_F_NETNS_LOCAL;
1471}
1472
81adee47 1473static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
c19e654d
HX
1474 struct nlattr *data[])
1475{
1476 struct ip_tunnel *nt;
1477 struct net *net = dev_net(dev);
1478 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1479 int mtu;
1480 int err;
1481
1482 nt = netdev_priv(dev);
1483 ipgre_netlink_parms(data, &nt->parms);
1484
e1a80002 1485 if (ipgre_tunnel_find(net, &nt->parms, dev->type))
c19e654d
HX
1486 return -EEXIST;
1487
e1a80002
HX
1488 if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1489 random_ether_addr(dev->dev_addr);
1490
c19e654d
HX
1491 mtu = ipgre_tunnel_bind_dev(dev);
1492 if (!tb[IFLA_MTU])
1493 dev->mtu = mtu;
1494
1495 err = register_netdevice(dev);
1496 if (err)
1497 goto out;
1498
1499 dev_hold(dev);
1500 ipgre_tunnel_link(ign, nt);
1501
1502out:
1503 return err;
1504}
1505
1506static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1507 struct nlattr *data[])
1508{
1509 struct ip_tunnel *t, *nt;
1510 struct net *net = dev_net(dev);
1511 struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1512 struct ip_tunnel_parm p;
1513 int mtu;
1514
1515 if (dev == ign->fb_tunnel_dev)
1516 return -EINVAL;
1517
1518 nt = netdev_priv(dev);
1519 ipgre_netlink_parms(data, &p);
1520
1521 t = ipgre_tunnel_locate(net, &p, 0);
1522
1523 if (t) {
1524 if (t->dev != dev)
1525 return -EEXIST;
1526 } else {
c19e654d
HX
1527 t = nt;
1528
2e9526b3 1529 if (dev->type != ARPHRD_ETHER) {
1507850b 1530 unsigned int nflags = 0;
c19e654d 1531
2e9526b3
HX
1532 if (ipv4_is_multicast(p.iph.daddr))
1533 nflags = IFF_BROADCAST;
1534 else if (p.iph.daddr)
1535 nflags = IFF_POINTOPOINT;
1536
1537 if ((dev->flags ^ nflags) &
1538 (IFF_POINTOPOINT | IFF_BROADCAST))
1539 return -EINVAL;
1540 }
c19e654d
HX
1541
1542 ipgre_tunnel_unlink(ign, t);
1543 t->parms.iph.saddr = p.iph.saddr;
1544 t->parms.iph.daddr = p.iph.daddr;
1545 t->parms.i_key = p.i_key;
2e9526b3
HX
1546 if (dev->type != ARPHRD_ETHER) {
1547 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1548 memcpy(dev->broadcast, &p.iph.daddr, 4);
1549 }
c19e654d
HX
1550 ipgre_tunnel_link(ign, t);
1551 netdev_state_change(dev);
1552 }
1553
1554 t->parms.o_key = p.o_key;
1555 t->parms.iph.ttl = p.iph.ttl;
1556 t->parms.iph.tos = p.iph.tos;
1557 t->parms.iph.frag_off = p.iph.frag_off;
1558
1559 if (t->parms.link != p.link) {
1560 t->parms.link = p.link;
1561 mtu = ipgre_tunnel_bind_dev(dev);
1562 if (!tb[IFLA_MTU])
1563 dev->mtu = mtu;
1564 netdev_state_change(dev);
1565 }
1566
1567 return 0;
1568}
1569
1570static size_t ipgre_get_size(const struct net_device *dev)
1571{
1572 return
1573 /* IFLA_GRE_LINK */
1574 nla_total_size(4) +
1575 /* IFLA_GRE_IFLAGS */
1576 nla_total_size(2) +
1577 /* IFLA_GRE_OFLAGS */
1578 nla_total_size(2) +
1579 /* IFLA_GRE_IKEY */
1580 nla_total_size(4) +
1581 /* IFLA_GRE_OKEY */
1582 nla_total_size(4) +
1583 /* IFLA_GRE_LOCAL */
1584 nla_total_size(4) +
1585 /* IFLA_GRE_REMOTE */
1586 nla_total_size(4) +
1587 /* IFLA_GRE_TTL */
1588 nla_total_size(1) +
1589 /* IFLA_GRE_TOS */
1590 nla_total_size(1) +
1591 /* IFLA_GRE_PMTUDISC */
1592 nla_total_size(1) +
1593 0;
1594}
1595
1596static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1597{
1598 struct ip_tunnel *t = netdev_priv(dev);
1599 struct ip_tunnel_parm *p = &t->parms;
1600
1601 NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1602 NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1603 NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
ba9e64b1
PM
1604 NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1605 NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
4d74f8ba
PM
1606 NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1607 NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
c19e654d
HX
1608 NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1609 NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1610 NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1611
1612 return 0;
1613
1614nla_put_failure:
1615 return -EMSGSIZE;
1616}
1617
1618static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1619 [IFLA_GRE_LINK] = { .type = NLA_U32 },
1620 [IFLA_GRE_IFLAGS] = { .type = NLA_U16 },
1621 [IFLA_GRE_OFLAGS] = { .type = NLA_U16 },
1622 [IFLA_GRE_IKEY] = { .type = NLA_U32 },
1623 [IFLA_GRE_OKEY] = { .type = NLA_U32 },
4d74f8ba
PM
1624 [IFLA_GRE_LOCAL] = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1625 [IFLA_GRE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
c19e654d
HX
1626 [IFLA_GRE_TTL] = { .type = NLA_U8 },
1627 [IFLA_GRE_TOS] = { .type = NLA_U8 },
1628 [IFLA_GRE_PMTUDISC] = { .type = NLA_U8 },
1629};
1630
1631static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1632 .kind = "gre",
1633 .maxtype = IFLA_GRE_MAX,
1634 .policy = ipgre_policy,
1635 .priv_size = sizeof(struct ip_tunnel),
1636 .setup = ipgre_tunnel_setup,
1637 .validate = ipgre_tunnel_validate,
1638 .newlink = ipgre_newlink,
1639 .changelink = ipgre_changelink,
1640 .get_size = ipgre_get_size,
1641 .fill_info = ipgre_fill_info,
1642};
1643
e1a80002
HX
1644static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1645 .kind = "gretap",
1646 .maxtype = IFLA_GRE_MAX,
1647 .policy = ipgre_policy,
1648 .priv_size = sizeof(struct ip_tunnel),
1649 .setup = ipgre_tap_setup,
1650 .validate = ipgre_tap_validate,
1651 .newlink = ipgre_newlink,
1652 .changelink = ipgre_changelink,
1653 .get_size = ipgre_get_size,
1654 .fill_info = ipgre_fill_info,
1655};
1656
1da177e4
LT
1657/*
1658 * And now the modules code and kernel interface.
1659 */
1660
1661static int __init ipgre_init(void)
1662{
1663 int err;
1664
1665 printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1666
cfb8fbf2 1667 err = register_pernet_device(&ipgre_net_ops);
59a4c759 1668 if (err < 0)
c2892f02
AD
1669 return err;
1670
00959ade 1671 err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
c2892f02
AD
1672 if (err < 0) {
1673 printk(KERN_INFO "ipgre init: can't add protocol\n");
1674 goto add_proto_failed;
1675 }
7daa0004 1676
c19e654d
HX
1677 err = rtnl_link_register(&ipgre_link_ops);
1678 if (err < 0)
1679 goto rtnl_link_failed;
1680
e1a80002
HX
1681 err = rtnl_link_register(&ipgre_tap_ops);
1682 if (err < 0)
1683 goto tap_ops_failed;
1684
c19e654d 1685out:
1da177e4 1686 return err;
c19e654d 1687
e1a80002
HX
1688tap_ops_failed:
1689 rtnl_link_unregister(&ipgre_link_ops);
c19e654d 1690rtnl_link_failed:
00959ade 1691 gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
c2892f02
AD
1692add_proto_failed:
1693 unregister_pernet_device(&ipgre_net_ops);
c19e654d 1694 goto out;
1da177e4
LT
1695}
1696
db44575f 1697static void __exit ipgre_fini(void)
1da177e4 1698{
e1a80002 1699 rtnl_link_unregister(&ipgre_tap_ops);
c19e654d 1700 rtnl_link_unregister(&ipgre_link_ops);
00959ade 1701 if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1da177e4 1702 printk(KERN_INFO "ipgre close: can't remove protocol\n");
c2892f02 1703 unregister_pernet_device(&ipgre_net_ops);
1da177e4
LT
1704}
1705
1706module_init(ipgre_init);
1707module_exit(ipgre_fini);
1708MODULE_LICENSE("GPL");
4d74f8ba
PM
1709MODULE_ALIAS_RTNL_LINK("gre");
1710MODULE_ALIAS_RTNL_LINK("gretap");