/* net/ipv4/ip_gre.c — GRE over IP protocol decoder (net-next-2.6) */
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/if_ether.h>
31
32 #include <net/sock.h>
33 #include <net/ip.h>
34 #include <net/icmp.h>
35 #include <net/protocol.h>
36 #include <net/ipip.h>
37 #include <net/arp.h>
38 #include <net/checksum.h>
39 #include <net/dsfield.h>
40 #include <net/inet_ecn.h>
41 #include <net/xfrm.h>
42 #include <net/net_namespace.h>
43 #include <net/netns/generic.h>
44
45 #ifdef CONFIG_IPV6
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #endif
50
51 /*
52    Problems & solutions
53    --------------------
54
55    1. The most important issue is detecting local dead loops.
56    They would cause complete host lockup in transmit, which
57    would be "resolved" by stack overflow or, if queueing is enabled,
58    with infinite looping in net_bh.
59
60    We cannot track such dead loops during route installation,
61    it is infeasible task. The most general solutions would be
62    to keep skb->encapsulation counter (sort of local ttl),
63    and silently drop packet when it expires. It is the best
64    solution, but it supposes maintaining a new variable in ALL
65    skb, even if no tunneling is used.
66
67    Current solution: t->recursion lock breaks dead loops. It looks
68    like dev->tbusy flag, but I preferred new variable, because
69    the semantics is different. One day, when hard_start_xmit
70    will be multithreaded we will have to use skb->encapsulation.
71
72
73
74    2. Networking dead loops would not kill routers, but would really
75    kill network. IP hop limit plays role of "t->recursion" in this case,
76    if we copy it from packet being encapsulated to upper header.
77    It is very good solution, but it introduces two problems:
78
79    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
80      do not work over tunnels.
81    - traceroute does not work. I planned to relay ICMP from tunnel,
82      so that this problem would be solved and traceroute output
83      would even more informative. This idea appeared to be wrong:
84      only Linux complies to rfc1812 now (yes, guys, Linux is the only
85      true router now :-)), all routers (at least, in neighbourhood of mine)
86      return only 8 bytes of payload. It is the end.
87
88    Hence, if we want OSPF to work or traceroute to say something reasonable,
89    we should search for another solution.
90
91    One of them is to parse packet trying to detect inner encapsulation
92    made by our node. It is difficult or even impossible, especially,
93    taking into account fragmentation. To be short, it is not a solution at all.
94
95    Current solution: The solution was UNEXPECTEDLY SIMPLE.
96    We force DF flag on tunnels with preconfigured hop limit,
97    that is ALL. :-) Well, it does not remove the problem completely,
98    but exponential growth of network traffic is changed to linear
99    (branches, that exceed pmtu are pruned) and tunnel mtu
100    quickly degrades to a value <68, where looping stops.
101    Yes, it is not good if there exists a router in the loop,
102    which does not force DF, even when encapsulating packets have DF set.
103    But it is not our problem! Nobody could accuse us, we made
104    all that we could make. Even if it is your gated who injected
105    fatal route to network, even if it were you who configured
106    fatal static route: you are innocent. :-)
107
108
109
110    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
111    practically identical code. It would be good to glue them
112    together, but it is not very evident, how to make them modular.
113    sit is integral part of IPv6, ipip and gre are naturally modular.
114    We could extract common parts (hash table, ioctl etc)
115    to a separate module (ip_tunnel.c).
116
117    Alexey Kuznetsov.
118  */
119
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

static int ipgre_fb_tunnel_init(struct net_device *dev);

#define HASH_SIZE  16

/* Per-network-namespace state: four tunnel hash tables plus the
 * catch-all fallback device.  Looked up via net_generic(). */
static int ipgre_net_id;
struct ipgre_net {
	struct ip_tunnel *tunnels[4][HASH_SIZE];

	struct net_device *fb_tunnel_dev;
};

/* Tunnel hash table */

/*
   4 hash tables:

   3: (remote,local)
   2: (remote,*)
   1: (*,local)
   0: (*,*)

   We require exact key match i.e. if a key is present in packet
   it will match only tunnel with the same key; if it is not present,
   it will match only keyless tunnel.

   All keyless packets, if not matched to configured keyless tunnels,
   will match the fallback tunnel.
 */

/* Fold a 32-bit address (or GRE key) down to a 4-bit bucket index. */
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

/* Convenience aliases for the four tables described above. */
#define tunnels_r_l     tunnels[3]
#define tunnels_r       tunnels[2]
#define tunnels_l       tunnels[1]
#define tunnels_wc      tunnels[0]

/* Protects the hash chains above (readers in rcv/err paths, writers
 * when tunnels are linked/unlinked). */
static DEFINE_RWLOCK(ipgre_lock);
162
/*
 * Given src, dst and key, find the appropriate tunnel for an input packet.
 *
 * The four tables are probed from most to least specific:
 * (remote,local), (remote,*), (*,local), (*,*).  A candidate matches
 * only if its input key equals @key exactly and its device is up.
 * If nothing matches and the fallback device is up, the fallback
 * tunnel receives the packet.
 *
 * Callers hold ipgre_lock for reading (see ipgre_rcv/ipgre_err).
 */
static struct ip_tunnel * ipgre_tunnel_lookup(struct net *net,
		__be32 remote, __be32 local, __be32 key)
{
	unsigned h0 = HASH(remote);
	unsigned h1 = HASH(key);
	struct ip_tunnel *t;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	/* (remote,local): bucket mixes remote address and key. */
	for (t = ign->tunnels_r_l[h0^h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
				return t;
		}
	}
	/* (remote,*): any local address. */
	for (t = ign->tunnels_r[h0^h1]; t; t = t->next) {
		if (remote == t->parms.iph.daddr) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
				return t;
		}
	}
	/* (*,local): also accept a multicast @local configured as the
	 * tunnel's destination (broadcast-style GRE). */
	for (t = ign->tunnels_l[h1]; t; t = t->next) {
		if (local == t->parms.iph.saddr ||
		     (local == t->parms.iph.daddr &&
		      ipv4_is_multicast(local))) {
			if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
				return t;
		}
	}
	/* (*,*): wildcard tunnels, keyed by @key alone. */
	for (t = ign->tunnels_wc[h1]; t; t = t->next) {
		if (t->parms.i_key == key && (t->dev->flags&IFF_UP))
			return t;
	}

	if (ign->fb_tunnel_dev->flags&IFF_UP)
		return netdev_priv(ign->fb_tunnel_dev);
	return NULL;
}
202
203 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
204                 struct ip_tunnel_parm *parms)
205 {
206         __be32 remote = parms->iph.daddr;
207         __be32 local = parms->iph.saddr;
208         __be32 key = parms->i_key;
209         unsigned h = HASH(key);
210         int prio = 0;
211
212         if (local)
213                 prio |= 1;
214         if (remote && !ipv4_is_multicast(remote)) {
215                 prio |= 2;
216                 h ^= HASH(remote);
217         }
218
219         return &ign->tunnels[prio][h];
220 }
221
222 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
223                 struct ip_tunnel *t)
224 {
225         return __ipgre_bucket(ign, &t->parms);
226 }
227
/*
 * Insert tunnel @t at the head of its hash chain.
 *
 * NOTE(review): t->next is written before ipgre_lock is taken; only
 * the store making @t visible (*tp = t) is under the write lock.
 * Confirm against the file's read-side usage before changing this
 * ordering.
 */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel **tp = ipgre_bucket(ign, t);

	t->next = *tp;
	write_lock_bh(&ipgre_lock);
	*tp = t;
	write_unlock_bh(&ipgre_lock);
}
237
/*
 * Remove tunnel @t from its hash chain, if present.
 *
 * The chain is walked without the lock; only the unlinking store is
 * performed under ipgre_lock (write side, BH disabled).  A tunnel not
 * found in its bucket is silently ignored.
 */
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel **tp;

	for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
		if (t == *tp) {
			write_lock_bh(&ipgre_lock);
			*tp = t->next;
			write_unlock_bh(&ipgre_lock);
			break;
		}
	}
}
251
/*
 * Find a tunnel exactly matching @parms (local address, remote address
 * and input key); when none exists and @create is non-zero, allocate
 * and register a new tunnel device.
 *
 * Returns the existing or newly created tunnel, or NULL when no match
 * is found with @create == 0, or when allocation/naming/registration
 * fails.
 */
static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
		struct ip_tunnel_parm *parms, int create)
{
	__be32 remote = parms->iph.daddr;
	__be32 local = parms->iph.saddr;
	__be32 key = parms->i_key;
	struct ip_tunnel *t, **tp, *nt;
	struct net_device *dev;
	char name[IFNAMSIZ];
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	/* Look for an exact (local, remote, key) match in the bucket. */
	for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) {
		if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) {
			if (key == t->parms.i_key)
				return t;
		}
	}
	if (!create)
		return NULL;

	if (parms->name[0])
		strlcpy(name, parms->name, IFNAMSIZ);
	else
		sprintf(name, "gre%%d");	/* "%%" yields the literal template "gre%d" */

	dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	/* A '%' in the name means a "gre%d"-style template: let the core
	 * pick a free unit number. */
	if (strchr(name, '%')) {
		if (dev_alloc_name(dev, name) < 0)
			goto failed_free;
	}

	dev->init = ipgre_tunnel_init;
	nt = netdev_priv(dev);
	nt->parms = *parms;

	if (register_netdevice(dev) < 0)
		goto failed_free;

	/* Hold a reference for the hash table; released by
	 * ipgre_tunnel_uninit() when the device goes away. */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);
	return nt;

failed_free:
	free_netdev(dev);
	return NULL;
}
303
/*
 * netdev uninit hook: unhash the tunnel from its namespace's tables
 * and drop the reference taken in ipgre_tunnel_locate().
 */
static void ipgre_tunnel_uninit(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	ipgre_tunnel_unlink(ign, netdev_priv(dev));
	dev_put(dev);
}
312
313
/*
 * ICMP error handler for GRE: an ICMP error quoting one of our
 * transmitted GRE packets arrived.  Parse the quoted headers, find the
 * originating tunnel and record an error-burst timestamp/counter that
 * the transmit path consumes (see tunnel->err_count handling in
 * ipgre_tunnel_xmit).
 */
static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes it impossible to maintain even soft state
   for keyed GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by Cisco employee,
   what the hell these idiots break standards established
   by themselves???
 */

	/* skb->data points at the quoted (inner) IP header. */
	struct iphdr *iph = (struct iphdr*)skb->data;
	__be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
	int grehlen = (iph->ihl<<2) + 4;	/* quoted IP header + 4-byte base GRE header */
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		/* The key sits after the optional checksum word, so both
		 * extend the quoted-header length we need. */
		if (flags&GRE_KEY) {
			grehlen += 4;
			if (flags&GRE_CSUM)
				grehlen += 4;
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	/* Only react to errors that are meaningful for a tunnel. */
	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	read_lock(&ipgre_lock);
	/* Quoted packet was ours, so its daddr is the tunnel's remote;
	 * the key, when present, is the last option word. */
	t = ipgre_tunnel_lookup(dev_net(skb->dev), iph->daddr, iph->saddr,
			(flags&GRE_KEY) ?
			*(((__be32*)p) + (grehlen>>2) - 1) : 0);
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	/* ttl==0 means "copy inner ttl": TTL-exceeded is then expected. */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	/* Count errors within IPTUNNEL_ERR_TIMEO as one burst. */
	if (jiffies - t->err_time < IPTUNNEL_ERR_TIMEO)
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	read_unlock(&ipgre_lock);
	return;
}
401
402 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
403 {
404         if (INET_ECN_is_ce(iph->tos)) {
405                 if (skb->protocol == htons(ETH_P_IP)) {
406                         IP_ECN_set_ce(ip_hdr(skb));
407                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
408                         IP6_ECN_set_ce(ipv6_hdr(skb));
409                 }
410         }
411 }
412
413 static inline u8
414 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
415 {
416         u8 inner = 0;
417         if (skb->protocol == htons(ETH_P_IP))
418                 inner = old_iph->tos;
419         else if (skb->protocol == htons(ETH_P_IPV6))
420                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
421         return INET_ECN_encapsulate(tos, inner);
422 }
423
/*
 * GRE receive handler.
 *
 * Parses the GRE flag word and optional checksum/key/sequence fields,
 * looks up the matching tunnel (under ipgre_lock), validates checksum
 * and sequence number when the tunnel requires them, strips the GRE
 * header and re-injects the inner packet via netif_rx().  Unmatched
 * packets trigger an ICMP port-unreachable; invalid ones are dropped.
 * Always returns 0.
 */
static int ipgre_rcv(struct sk_buff *skb)
{
	struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;	/* bytes of GRE header consumed so far */

	/* 16 = 4-byte base header plus up to 12 bytes of csum/key/seq. */
	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16*)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			/* Verify the GRE checksum; csum != 0 means bad. */
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32*)(h + offset));
			offset += 4;
		}
	}

	read_lock(&ipgre_lock);
	if ((tunnel = ipgre_tunnel_lookup(dev_net(skb->dev),
					iph->saddr, iph->daddr, key)) != NULL) {
		secpath_reset(skb);

		skb->protocol = *(__be16*)(h + 2);
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 &&
		    skb->protocol == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			/* 0x4_ = IPv4 version nibble; anything else is the
			 * WCCPv2 redirect header, skip it. */
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		/* Strip the GRE header and make the inner packet current. */
		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);
		skb_reset_network_header(skb);
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (skb->rtable->fl.iif == 0)
				goto drop;
			tunnel->stat.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		/* Drop when the received checksum failed, or when the
		 * tunnel requires a checksum the sender did not supply. */
		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->stat.rx_crc_errors++;
			tunnel->stat.rx_errors++;
			goto drop;
		}
		/* Enforce in-order sequence numbers when negotiated. */
		if (tunnel->parms.i_flags&GRE_SEQ) {
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->stat.rx_fifo_errors++;
				tunnel->stat.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}
		tunnel->stat.rx_packets++;
		tunnel->stat.rx_bytes += skb->len;
		skb->dev = tunnel->dev;
		dst_release(skb->dst);
		skb->dst = NULL;
		nf_reset(skb);
		ipgre_ecn_decapsulate(iph, skb);
		netif_rx(skb);
		read_unlock(&ipgre_lock);
		return(0);
	}
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	read_unlock(&ipgre_lock);
drop_nolock:
	kfree_skb(skb);
	return(0);
}
539
/*
 * Transmit hook: encapsulate @skb in IP+GRE and send it toward the
 * tunnel's remote endpoint.
 *
 * Handles NBMA tunnels (destination taken from the route/neighbour),
 * TOS inheritance, PMTU enforcement for IPv4 and IPv6 payloads,
 * headroom expansion, and the optional GRE checksum/key/sequence
 * options.  tunnel->recursion breaks local encapsulation dead loops
 * (see the file-head commentary).  Always returns 0.
 */
static int ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct net_device_stats *stats = &tunnel->stat;
	struct iphdr  *old_iph = ip_hdr(skb);
	struct iphdr  *tiph;
	u8     tos;
	__be16 df;
	struct rtable *rt;			/* Route to the other host */
	struct net_device *tdev;		/* Device to other host */
	struct iphdr  *iph;			/* Our new IP header */
	unsigned int max_headroom;		/* The extra header space needed */
	int    gre_hlen;
	__be32 dst;
	int    mtu;

	/* Re-entered from our own output path: break the loop. */
	if (tunnel->recursion++) {
		tunnel->stat.collisions++;
		goto tx_error;
	}

	/* With header_ops, the caller already built the outer header at
	 * skb->data; otherwise use the configured tunnel parameters. */
	if (dev->header_ops) {
		gre_hlen = 0;
		tiph = (struct iphdr*)skb->data;
	} else {
		gre_hlen = tunnel->hlen;
		tiph = &tunnel->parms.iph;
	}

	if ((dst = tiph->daddr) == 0) {
		/* NBMA tunnel */

		if (skb->dst == NULL) {
			tunnel->stat.tx_fifo_errors++;
			goto tx_error;
		}

		if (skb->protocol == htons(ETH_P_IP)) {
			rt = skb->rtable;
			if ((dst = rt->rt_gateway) == 0)
				goto tx_error_icmp;
		}
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6)) {
			struct in6_addr *addr6;
			int addr_type;
			struct neighbour *neigh = skb->dst->neighbour;

			if (neigh == NULL)
				goto tx_error;

			addr6 = (struct in6_addr*)&neigh->primary_key;
			addr_type = ipv6_addr_type(addr6);

			if (addr_type == IPV6_ADDR_ANY) {
				addr6 = &ipv6_hdr(skb)->daddr;
				addr_type = ipv6_addr_type(addr6);
			}

			/* Only v4-compatible v6 addresses carry an IPv4
			 * destination in their last 32 bits. */
			if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
				goto tx_error_icmp;

			dst = addr6->s6_addr32[3];
		}
#endif
		else
			goto tx_error;
	}

	/* Low bit of configured TOS means "inherit from inner IPv4". */
	tos = tiph->tos;
	if (tos&1) {
		if (skb->protocol == htons(ETH_P_IP))
			tos = old_iph->tos;
		tos &= ~1;
	}

	{
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = dst,
						.saddr = tiph->saddr,
						.tos = RT_TOS(tos) } },
				    .proto = IPPROTO_GRE };
		if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
			tunnel->stat.tx_carrier_errors++;
			goto tx_error;
		}
	}
	tdev = rt->u.dst.dev;

	/* Routing back to ourselves would loop. */
	if (tdev == dev) {
		ip_rt_put(rt);
		tunnel->stat.collisions++;
		goto tx_error;
	}

	df = tiph->frag_off;
	if (df)
		mtu = dst_mtu(&rt->u.dst) - tunnel->hlen;
	else
		mtu = skb->dst ? dst_mtu(skb->dst) : dev->mtu;

	if (skb->dst)
		skb->dst->ops->update_pmtu(skb->dst, mtu);

	if (skb->protocol == htons(ETH_P_IP)) {
		df |= (old_iph->frag_off&htons(IP_DF));

		/* Inner DF set and packet too big: report FRAG_NEEDED. */
		if ((old_iph->frag_off&htons(IP_DF)) &&
		    mtu < ntohs(old_iph->tot_len)) {
			icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#ifdef CONFIG_IPV6
	else if (skb->protocol == htons(ETH_P_IPV6)) {
		struct rt6_info *rt6 = (struct rt6_info*)skb->dst;

		/* Record the smaller tunnel MTU on host/configured routes. */
		if (rt6 && mtu < dst_mtu(skb->dst) && mtu >= IPV6_MIN_MTU) {
			if ((tunnel->parms.iph.daddr &&
			     !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
			    rt6->rt6i_dst.plen == 128) {
				rt6->rt6i_flags |= RTF_MODIFIED;
				skb->dst->metrics[RTAX_MTU-1] = mtu;
			}
		}

		if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
			icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
			ip_rt_put(rt);
			goto tx_error;
		}
	}
#endif

	/* Relay recent ICMP errors (recorded by ipgre_err) to the
	 * sender as link failures, one per queued error. */
	if (tunnel->err_count > 0) {
		if (jiffies - tunnel->err_time < IPTUNNEL_ERR_TIMEO) {
			tunnel->err_count--;

			dst_link_failure(skb);
		} else
			tunnel->err_count = 0;
	}

	max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;

	/* Ensure private, writable headroom for the new headers. */
	if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
	    (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
		struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
		if (!new_skb) {
			ip_rt_put(rt);
			stats->tx_dropped++;
			dev_kfree_skb(skb);
			tunnel->recursion--;
			return 0;
		}
		if (skb->sk)
			skb_set_owner_w(new_skb, skb->sk);
		dev_kfree_skb(skb);
		skb = new_skb;
		old_iph = ip_hdr(skb);
	}

	skb->transport_header = skb->network_header;
	skb_push(skb, gre_hlen);
	skb_reset_network_header(skb);
	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	dst_release(skb->dst);
	skb->dst = &rt->u.dst;

	/*
	 *	Push down and install the IPIP header.
	 */

	iph 			=	ip_hdr(skb);
	iph->version		=	4;
	iph->ihl		=	sizeof(struct iphdr) >> 2;
	iph->frag_off		=	df;
	iph->protocol		=	IPPROTO_GRE;
	iph->tos		=	ipgre_ecn_encapsulate(tos, old_iph, skb);
	iph->daddr		=	rt->rt_dst;
	iph->saddr		=	rt->rt_src;

	/* ttl==0 means inherit from the inner packet (or the route). */
	if ((iph->ttl = tiph->ttl) == 0) {
		if (skb->protocol == htons(ETH_P_IP))
			iph->ttl = old_iph->ttl;
#ifdef CONFIG_IPV6
		else if (skb->protocol == htons(ETH_P_IPV6))
			iph->ttl = ((struct ipv6hdr*)old_iph)->hop_limit;
#endif
		else
			iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
	}

	/* Base GRE header: flags word + protocol word. */
	((__be16*)(iph+1))[0] = tunnel->parms.o_flags;
	((__be16*)(iph+1))[1] = skb->protocol;

	/* Fill option words backwards from the end of the GRE header:
	 * seq (last), then key, then checksum (first). */
	if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
		__be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);

		if (tunnel->parms.o_flags&GRE_SEQ) {
			++tunnel->o_seqno;
			*ptr = htonl(tunnel->o_seqno);
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_KEY) {
			*ptr = tunnel->parms.o_key;
			ptr--;
		}
		if (tunnel->parms.o_flags&GRE_CSUM) {
			*ptr = 0;
			*(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
		}
	}

	nf_reset(skb);

	IPTUNNEL_XMIT();
	tunnel->recursion--;
	return 0;

tx_error_icmp:
	dst_link_failure(skb);

tx_error:
	stats->tx_errors++;
	dev_kfree_skb(skb);
	tunnel->recursion--;
	return 0;
}
773
/*
 * Derive the tunnel device's link parameters (MTU, hard_header_len,
 * iflink, cached GRE header length) from the underlying device that
 * traffic to the remote endpoint would use.
 */
static void ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int hlen = LL_MAX_HEADER;	/* pessimistic defaults when no */
	int mtu = ETH_DATA_LEN;		/* underlying device is found   */
	int addend = sizeof(struct iphdr) + 4;	/* outer IP + base GRE header */

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and hard_header_len */

	if (iph->daddr) {
		struct flowi fl = { .oif = tunnel->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = iph->daddr,
						.saddr = iph->saddr,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
			tdev = rt->u.dst.dev;
			ip_rt_put(rt);
		}
		dev->flags |= IFF_POINTOPOINT;
	}

	/* No route, but an explicit underlying link was configured. */
	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->hard_header_len = hlen + addend;
	dev->mtu = mtu - addend;
	tunnel->hlen = addend;	/* outer IP + full GRE header, used by xmit */

}
826
/*
 * ioctl handler for the SIOC{GET,ADD,CHG,DEL}TUNNEL tunnel-management
 * commands (as issued by "ip tunnel").  Called under RTNL via dev_ioctl(),
 * so the tunnel hash-table link/unlink below needs no extra locking.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			/* On the fallback device, user space passes the parms
			 * of the tunnel to look up; on a real tunnel device
			 * we simply report that device's own parms. */
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Validate: GRE-in-IPv4 with no IP options, no frag bits
		 * other than DF, and no GRE version/routing flags (only
		 * version 0 without source routing is supported). */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);

		/* Keys are meaningful only with the GRE_KEY flag set; zero
		 * them otherwise so hash lookups behave consistently. */
		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* New parms already belong to a different
				 * tunnel device. */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned nflags=0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* A tunnel may not switch between broadcast
				 * and point-to-point semantics on the fly. */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* Addresses/keys participate in the hash:
				 * unlink, update, re-link. */
				ipgre_tunnel_unlink(ign, t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					/* Underlying device changed: recompute
					 * header length and MTU. */
					ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			/* Return the effective parms to user space. */
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			/* Deleting through the fallback device: find the
			 * victim by parms; the fallback itself may not be
			 * deleted this way. */
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
956
957 static struct net_device_stats *ipgre_tunnel_get_stats(struct net_device *dev)
958 {
959         return &(((struct ip_tunnel*)netdev_priv(dev))->stat);
960 }
961
962 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
963 {
964         struct ip_tunnel *tunnel = netdev_priv(dev);
965         if (new_mtu < 68 || new_mtu > 0xFFF8 - tunnel->hlen)
966                 return -EINVAL;
967         dev->mtu = new_mtu;
968         return 0;
969 }
970
971 /* Nice toy. Unfortunately, useless in real life :-)
972    It allows to construct virtual multiprotocol broadcast "LAN"
973    over the Internet, provided multicast routing is tuned.
974
975
   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).
980
981    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
982    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
983
984    ping -t 255 224.66.66.66
985
986    If nobody answers, mbone does not work.
987
988    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
989    ip addr add 10.66.66.<somewhat>/24 dev Universe
990    ifconfig Universe up
991    ifconfig Universe add fe80::<Your_real_addr>/10
992    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
993    ftp 10.66.66.66
994    ...
995    ftp fec0:6666:6666::193.233.7.65
996    ...
997
998  */
999
/*
 * header_ops->create hook: prepend the precomputed outer IP header plus
 * the first two 16-bit GRE words (output flags and the inner protocol
 * type) to the skb.
 *
 * Returns the pushed length (t->hlen) when the outer destination is fully
 * determined, or -t->hlen when the caller still has to fill in the outer
 * daddr later, following the dev_hard_header() convention.
 */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
			unsigned short type,
			const void *daddr, const void *saddr, unsigned len)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
	__be16 *p = (__be16*)(iph+1);	/* GRE words follow the IP header */

	/* Template outer header configured for this tunnel. */
	memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
	p[0]		= t->parms.o_flags;
	p[1]		= htons(type);

	/*
	 *	Set the source hardware address.
	 */

	if (saddr)
		memcpy(&iph->saddr, saddr, 4);

	if (daddr) {
		memcpy(&iph->daddr, daddr, 4);
		return t->hlen;
	}
	/* No daddr given: a fixed unicast template destination suffices. */
	if (iph->daddr && !ipv4_is_multicast(iph->daddr))
		return t->hlen;

	return -t->hlen;
}
1028
1029 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1030 {
1031         struct iphdr *iph = (struct iphdr*) skb_mac_header(skb);
1032         memcpy(haddr, &iph->saddr, 4);
1033         return 4;
1034 }
1035
/* header_ops used by NBMA and broadcast GRE devices (ipgre_tunnel_init). */
static const struct header_ops ipgre_header_ops = {
	.create = ipgre_header,
	.parse  = ipgre_header_parse,
};
1040
#ifdef CONFIG_NET_IPGRE_BROADCAST
/*
 * dev->open for broadcast-mode GRE tunnels (multicast destination):
 * route the multicast daddr to find the underlying device and join the
 * group on it.  The underlying ifindex is remembered in t->mlink so
 * ipgre_close() can leave the same group even if routing changes later.
 * Runs under RTNL, hence the __in_dev_get_rtnl() accessor.
 */
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi fl = { .oif = t->parms.link,
				    .nl_u = { .ip4_u =
					      { .daddr = t->parms.iph.daddr,
						.saddr = t->parms.iph.saddr,
						.tos = RT_TOS(t->parms.iph.tos) } },
				    .proto = IPPROTO_GRE };
		struct rtable *rt;
		if (ip_route_output_key(dev_net(dev), &rt, &fl))
			return -EADDRNOTAVAIL;
		/* NB: from here on "dev" is the *underlying* device. */
		dev = rt->u.dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}
1065
1066 static int ipgre_close(struct net_device *dev)
1067 {
1068         struct ip_tunnel *t = netdev_priv(dev);
1069         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1070                 struct in_device *in_dev;
1071                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1072                 if (in_dev) {
1073                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1074                         in_dev_put(in_dev);
1075                 }
1076         }
1077         return 0;
1078 }
1079
1080 #endif
1081
/*
 * alloc_netdev() setup callback: fill in the method table and default
 * link-layer identity for a GRE tunnel device.
 */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->uninit		= ipgre_tunnel_uninit;
	dev->destructor		= free_netdev;
	dev->hard_start_xmit	= ipgre_tunnel_xmit;
	dev->get_stats		= ipgre_tunnel_get_stats;
	dev->do_ioctl		= ipgre_tunnel_ioctl;
	dev->change_mtu		= ipgre_tunnel_change_mtu;

	dev->type		= ARPHRD_IPGRE;
	/* Worst-case header: link layer + outer IP + 4-byte base GRE. */
	dev->hard_header_len	= LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags		= IFF_NOARP;
	dev->iflink		= 0;
	dev->addr_len		= 4;	/* "hw" address is an IPv4 address */
	/* Tunnels must not be moved between network namespaces. */
	dev->features		|= NETIF_F_NETNS_LOCAL;
}
1099
/*
 * dev->init for regular (non-fallback) GRE tunnels: copy the endpoint
 * addresses into dev_addr/broadcast, bind to the underlying device to
 * size MTU/header, and pick header_ops and open/stop hooks according to
 * the tunnel's addressing mode.
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	ipgre_tunnel_bind_dev(dev);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Broadcast mode needs a local address to be able
			 * to join the multicast group. */
			if (!iph->saddr)
				return -EINVAL;
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
			dev->open = ipgre_open;
			dev->stop = ipgre_close;
		}
#endif
	} else
		/* NBMA mode: destination supplied per-packet via header. */
		dev->header_ops = &ipgre_header_ops;

	return 0;
}
1132
/*
 * dev->init for the per-namespace fallback "gre0" device.  It is linked
 * into the keyless wildcard chain so it receives otherwise-unmatched GRE
 * packets, and holds a self-reference for the lifetime of the netns.
 * Note: relies on the priv area being zeroed by alloc_netdev().
 */
static int ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version		= 4;
	iph->protocol		= IPPROTO_GRE;
	iph->ihl		= 5;
	tunnel->hlen		= sizeof(struct iphdr) + 4;	/* outer IP + base GRE */

	dev_hold(dev);
	ign->tunnels_wc[0]	= tunnel;
	return 0;
}
1151
1152
/* Receive/error hooks for IP protocol 47 (GRE); netns-aware. */
static struct net_protocol ipgre_protocol = {
	.handler	=	ipgre_rcv,
	.err_handler	=	ipgre_err,
	.netns_ok	=	1,
};
1158
1159 static void ipgre_destroy_tunnels(struct ipgre_net *ign)
1160 {
1161         int prio;
1162
1163         for (prio = 0; prio < 4; prio++) {
1164                 int h;
1165                 for (h = 0; h < HASH_SIZE; h++) {
1166                         struct ip_tunnel *t;
1167                         while ((t = ign->tunnels[prio][h]) != NULL)
1168                                 unregister_netdevice(t->dev);
1169                 }
1170         }
1171 }
1172
1173 static int ipgre_init_net(struct net *net)
1174 {
1175         int err;
1176         struct ipgre_net *ign;
1177
1178         err = -ENOMEM;
1179         ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1180         if (ign == NULL)
1181                 goto err_alloc;
1182
1183         err = net_assign_generic(net, ipgre_net_id, ign);
1184         if (err < 0)
1185                 goto err_assign;
1186
1187         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1188                                            ipgre_tunnel_setup);
1189         if (!ign->fb_tunnel_dev) {
1190                 err = -ENOMEM;
1191                 goto err_alloc_dev;
1192         }
1193
1194         ign->fb_tunnel_dev->init = ipgre_fb_tunnel_init;
1195         dev_net_set(ign->fb_tunnel_dev, net);
1196
1197         if ((err = register_netdev(ign->fb_tunnel_dev)))
1198                 goto err_reg_dev;
1199
1200         return 0;
1201
1202 err_reg_dev:
1203         free_netdev(ign->fb_tunnel_dev);
1204 err_alloc_dev:
1205         /* nothing */
1206 err_assign:
1207         kfree(ign);
1208 err_alloc:
1209         return err;
1210 }
1211
1212 static void ipgre_exit_net(struct net *net)
1213 {
1214         struct ipgre_net *ign;
1215
1216         ign = net_generic(net, ipgre_net_id);
1217         rtnl_lock();
1218         ipgre_destroy_tunnels(ign);
1219         rtnl_unlock();
1220         kfree(ign);
1221 }
1222
/* Per-network-namespace setup/teardown for the GRE tunnel driver. */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
};
1227
1228 /*
1229  *      And now the modules code and kernel interface.
1230  */
1231
1232 static int __init ipgre_init(void)
1233 {
1234         int err;
1235
1236         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1237
1238         if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1239                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1240                 return -EAGAIN;
1241         }
1242
1243         err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1244         if (err < 0)
1245                 inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1246
1247         return err;
1248 }
1249
1250 static void __exit ipgre_fini(void)
1251 {
1252         if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1253                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1254
1255         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1256 }
1257
/* Module entry/exit points and license. */
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");