]> bbs.cooldavid.org Git - net-next-2.6.git/blob - net/ipv4/ip_gre.c
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/linville/wirel...
[net-next-2.6.git] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/slab.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
21 #include <linux/in.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/etherdevice.h>
32 #include <linux/if_ether.h>
33
34 #include <net/sock.h>
35 #include <net/ip.h>
36 #include <net/icmp.h>
37 #include <net/protocol.h>
38 #include <net/ipip.h>
39 #include <net/arp.h>
40 #include <net/checksum.h>
41 #include <net/dsfield.h>
42 #include <net/inet_ecn.h>
43 #include <net/xfrm.h>
44 #include <net/net_namespace.h>
45 #include <net/netns/generic.h>
46 #include <net/rtnetlink.h>
47 #include <net/gre.h>
48
49 #ifdef CONFIG_IPV6
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #endif
54
55 /*
56    Problems & solutions
57    --------------------
58
59    1. The most important issue is detecting local dead loops.
60    They would cause complete host lockup in transmit, which
61    would be "resolved" by stack overflow or, if queueing is enabled,
62    with infinite looping in net_bh.
63
64    We cannot track such dead loops during route installation,
65    it is infeasible task. The most general solutions would be
66    to keep skb->encapsulation counter (sort of local ttl),
67    and silently drop packet when it expires. It is the best
68    solution, but it supposes maintaining a new variable in ALL
69    skb, even if no tunneling is used.
70
71    Current solution: HARD_TX_LOCK lock breaks dead loops.
72
73
74
75    2. Networking dead loops would not kill routers, but would really
76    kill network. IP hop limit plays role of "t->recursion" in this case,
77    if we copy it from packet being encapsulated to upper header.
78    It is very good solution, but it introduces two problems:
79
80    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
81      do not work over tunnels.
82    - traceroute does not work. I planned to relay ICMP from tunnel,
83      so that this problem would be solved and traceroute output
84      would even more informative. This idea appeared to be wrong:
85      only Linux complies to rfc1812 now (yes, guys, Linux is the only
86      true router now :-)), all routers (at least, in neighbourhood of mine)
87      return only 8 bytes of payload. It is the end.
88
89    Hence, if we want that OSPF worked or traceroute said something reasonable,
90    we should search for another solution.
91
92    One of them is to parse packet trying to detect inner encapsulation
93    made by our node. It is difficult or even impossible, especially,
94    taking into account fragmentation. To be short, it is not a solution at all.
95
96    Current solution: The solution was UNEXPECTEDLY SIMPLE.
97    We force DF flag on tunnels with preconfigured hop limit,
98    that is ALL. :-) Well, it does not remove the problem completely,
99    but exponential growth of network traffic is changed to linear
100    (branches, that exceed pmtu are pruned) and tunnel mtu
101    quickly degrades to a value <68, where looping stops.
102    Yes, it is not good if there exists a router in the loop,
103    which does not force DF, even when encapsulating packets have DF set.
104    But it is not our problem! Nobody could accuse us, we made
105    all that we could make. Even if it is your gated who injected
106    fatal route to network, even if it were you who configured
107    fatal static route: you are innocent. :-)
108
109
110
111    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
112    practically identical code. It would be good to glue them
113    together, but it is not very evident, how to make them modular.
114    sit is integral part of IPv6, ipip and gre are naturally modular.
115    We could extract common parts (hash table, ioctl etc)
116    to a separate module (ip_tunnel.c).
117
118    Alexey Kuznetsov.
119  */
120
121 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
122 static int ipgre_tunnel_init(struct net_device *dev);
123 static void ipgre_tunnel_setup(struct net_device *dev);
124 static int ipgre_tunnel_bind_dev(struct net_device *dev);
125
/* Fallback tunnel: no source, no destination, no key, no options */

#define HASH_SIZE  16

/* net_generic() slot for the per-network-namespace GRE state below. */
static int ipgre_net_id __read_mostly;
struct ipgre_net {
        /* 4 hash tables of tunnel chains, indexed [specificity][hash];
         * see the table layout comment below. */
        struct ip_tunnel *tunnels[4][HASH_SIZE];

        /* The namespace's fallback (catch-all) tunnel device. */
        struct net_device *fb_tunnel_dev;
};
136
137 /* Tunnel hash table */
138
139 /*
140    4 hash tables:
141
142    3: (remote,local)
143    2: (remote,*)
144    1: (*,local)
145    0: (*,*)
146
147    We require exact key match i.e. if a key is present in packet
148    it will match only tunnel with the same key; if it is not present,
149    it will match only keyless tunnel.
150
151    All keyless packets, if not matching a configured keyless tunnel,
152    will match the fallback tunnel.
153  */
154
/* Fold a 32-bit address (or GRE key) down to a 4-bit table index. */
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

/* Aliases for the four tables, from most to least specific match. */
#define tunnels_r_l     tunnels[3]
#define tunnels_r       tunnels[2]
#define tunnels_l       tunnels[1]
#define tunnels_wc      tunnels[0]
/*
 * Locking : hash tables are protected by RCU and a spinlock
 */
static DEFINE_SPINLOCK(ipgre_lock);

/* Walk one RCU-protected chain; expects 'struct ip_tunnel *t' in scope. */
#define for_each_ip_tunnel_rcu(start) \
        for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
168
/* Given src, dst and key, find appropriate for input tunnel. */

/*
 * ipgre_tunnel_lookup - find the receiving tunnel for an incoming GRE packet.
 * @dev: device the packet arrived on (its ifindex is used for scoring)
 * @remote: outer source address (the sender, i.e. the tunnel's remote end)
 * @local: outer destination address (the tunnel's local end)
 * @key: GRE key from the packet, 0 when absent
 * @gre_proto: GRE protocol field; ETH_P_TEB selects Ethernet (gretap) devices
 *
 * The four tables are searched from most to least specific:
 * (remote,local), (remote,*), (*,local), (*,*).  A tunnel matching on
 * both link and device type (score 0) wins immediately; otherwise the
 * lowest-scoring candidate seen across all tables is returned.  If
 * nothing matches, the namespace's fallback device is used when it is up.
 *
 * Caller must hold rcu_read_lock().
 */
static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
                                              __be32 remote, __be32 local,
                                              __be32 key, __be16 gre_proto)
{
        struct net *net = dev_net(dev);
        int link = dev->ifindex;
        unsigned h0 = HASH(remote);
        unsigned h1 = HASH(key);
        struct ip_tunnel *t, *cand = NULL;
        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
        int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
                       ARPHRD_ETHER : ARPHRD_IPGRE;
        /* score bits: 1 = link mismatch, 2 = device type mismatch */
        int score, cand_score = 4;

        /* Pass 1: fully specified tunnels -- (remote, local) */
        for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
                if (local != t->parms.iph.saddr ||
                    remote != t->parms.iph.daddr ||
                    key != t->parms.i_key ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->dev->type != ARPHRD_IPGRE &&
                    t->dev->type != dev_type)
                        continue;

                score = 0;
                if (t->parms.link != link)
                        score |= 1;
                if (t->dev->type != dev_type)
                        score |= 2;
                if (score == 0)
                        return t;

                if (score < cand_score) {
                        cand = t;
                        cand_score = score;
                }
        }

        /* Pass 2: remote endpoint only -- (remote, *) */
        for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
                if (remote != t->parms.iph.daddr ||
                    key != t->parms.i_key ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->dev->type != ARPHRD_IPGRE &&
                    t->dev->type != dev_type)
                        continue;

                score = 0;
                if (t->parms.link != link)
                        score |= 1;
                if (t->dev->type != dev_type)
                        score |= 2;
                if (score == 0)
                        return t;

                if (score < cand_score) {
                        cand = t;
                        cand_score = score;
                }
        }

        /* Pass 3: local endpoint only -- (*, local); a multicast "local"
         * may also match the tunnel's multicast destination address. */
        for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
                if ((local != t->parms.iph.saddr &&
                     (local != t->parms.iph.daddr ||
                      !ipv4_is_multicast(local))) ||
                    key != t->parms.i_key ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->dev->type != ARPHRD_IPGRE &&
                    t->dev->type != dev_type)
                        continue;

                score = 0;
                if (t->parms.link != link)
                        score |= 1;
                if (t->dev->type != dev_type)
                        score |= 2;
                if (score == 0)
                        return t;

                if (score < cand_score) {
                        cand = t;
                        cand_score = score;
                }
        }

        /* Pass 4: wildcard tunnels -- (*, *), matched by key alone */
        for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
                if (t->parms.i_key != key ||
                    !(t->dev->flags & IFF_UP))
                        continue;

                if (t->dev->type != ARPHRD_IPGRE &&
                    t->dev->type != dev_type)
                        continue;

                score = 0;
                if (t->parms.link != link)
                        score |= 1;
                if (t->dev->type != dev_type)
                        score |= 2;
                if (score == 0)
                        return t;

                if (score < cand_score) {
                        cand = t;
                        cand_score = score;
                }
        }

        if (cand != NULL)
                return cand;

        /* Nothing matched: hand the packet to the fallback device if up. */
        dev = ign->fb_tunnel_dev;
        if (dev->flags & IFF_UP)
                return netdev_priv(dev);

        return NULL;
}
292
293 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
294                 struct ip_tunnel_parm *parms)
295 {
296         __be32 remote = parms->iph.daddr;
297         __be32 local = parms->iph.saddr;
298         __be32 key = parms->i_key;
299         unsigned h = HASH(key);
300         int prio = 0;
301
302         if (local)
303                 prio |= 1;
304         if (remote && !ipv4_is_multicast(remote)) {
305                 prio |= 2;
306                 h ^= HASH(remote);
307         }
308
309         return &ign->tunnels[prio][h];
310 }
311
/* Convenience wrapper: hash chain for an existing tunnel's parameters. */
static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
                struct ip_tunnel *t)
{
        return __ipgre_bucket(ign, &t->parms);
}
317
/*
 * Insert @t at the head of its hash chain.  t->next is set before
 * rcu_assign_pointer() publishes the node, so lockless readers
 * (ipgre_tunnel_lookup) never see a half-linked entry; writers
 * serialize on ipgre_lock.
 */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
        struct ip_tunnel **tp = ipgre_bucket(ign, t);

        spin_lock_bh(&ipgre_lock);
        t->next = *tp;
        rcu_assign_pointer(*tp, t);
        spin_unlock_bh(&ipgre_lock);
}
327
/*
 * Remove @t from its hash chain.  The chain is walked without the lock;
 * only the actual unlink is done under ipgre_lock so concurrent RCU
 * readers always see a consistent list.
 * NOTE(review): the lockless walk presumably relies on writers being
 * serialized at a higher level (RTNL) -- confirm against callers.
 */
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
        struct ip_tunnel **tp;

        for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
                if (t == *tp) {
                        spin_lock_bh(&ipgre_lock);
                        *tp = t->next;
                        spin_unlock_bh(&ipgre_lock);
                        break;
                }
        }
}
341
342 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
343                                            struct ip_tunnel_parm *parms,
344                                            int type)
345 {
346         __be32 remote = parms->iph.daddr;
347         __be32 local = parms->iph.saddr;
348         __be32 key = parms->i_key;
349         int link = parms->link;
350         struct ip_tunnel *t, **tp;
351         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
352
353         for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
354                 if (local == t->parms.iph.saddr &&
355                     remote == t->parms.iph.daddr &&
356                     key == t->parms.i_key &&
357                     link == t->parms.link &&
358                     type == t->dev->type)
359                         break;
360
361         return t;
362 }
363
364 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
365                 struct ip_tunnel_parm *parms, int create)
366 {
367         struct ip_tunnel *t, *nt;
368         struct net_device *dev;
369         char name[IFNAMSIZ];
370         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
371
372         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
373         if (t || !create)
374                 return t;
375
376         if (parms->name[0])
377                 strlcpy(name, parms->name, IFNAMSIZ);
378         else
379                 sprintf(name, "gre%%d");
380
381         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
382         if (!dev)
383           return NULL;
384
385         dev_net_set(dev, net);
386
387         if (strchr(name, '%')) {
388                 if (dev_alloc_name(dev, name) < 0)
389                         goto failed_free;
390         }
391
392         nt = netdev_priv(dev);
393         nt->parms = *parms;
394         dev->rtnl_link_ops = &ipgre_link_ops;
395
396         dev->mtu = ipgre_tunnel_bind_dev(dev);
397
398         if (register_netdevice(dev) < 0)
399                 goto failed_free;
400
401         dev_hold(dev);
402         ipgre_tunnel_link(ign, nt);
403         return nt;
404
405 failed_free:
406         free_netdev(dev);
407         return NULL;
408 }
409
410 static void ipgre_tunnel_uninit(struct net_device *dev)
411 {
412         struct net *net = dev_net(dev);
413         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
414
415         ipgre_tunnel_unlink(ign, netdev_priv(dev));
416         dev_put(dev);
417 }
418
419
/*
 * ipgre_err - ICMP error handler for the GRE protocol.
 * @skb: ICMP error packet; skb->data points at the returned inner IP header
 * @info: ICMP info word (unused here; PMTU is handled by the IP core)
 *
 * Parse the echoed GRE header, look up the tunnel the failed packet
 * belonged to, and record an error event on it (err_count/err_time,
 * consumed by the transmit path for rate-limited link-failure reports).
 */
static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes impossible maintaining even soft state for keyed
   GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by Cisco employee,
   what the hell these idiots break standards established
   by themselves???
 */

        struct iphdr *iph = (struct iphdr *)skb->data;
        __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
        int grehlen = (iph->ihl<<2) + 4;        /* IP header + base GRE header */
        const int type = icmp_hdr(skb)->type;
        const int code = icmp_hdr(skb)->code;
        struct ip_tunnel *t;
        __be16 flags;

        flags = p[0];
        if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
                /* Version != 0 or routing present: not something we sent. */
                if (flags&(GRE_VERSION|GRE_ROUTING))
                        return;
                /* Account for optional checksum and key words before the key. */
                if (flags&GRE_KEY) {
                        grehlen += 4;
                        if (flags&GRE_CSUM)
                                grehlen += 4;
                }
        }

        /* If only 8 bytes returned, keyed message will be dropped here */
        if (skb_headlen(skb) < grehlen)
                return;

        switch (type) {
        default:
        case ICMP_PARAMETERPROB:
                return;

        case ICMP_DEST_UNREACH:
                switch (code) {
                case ICMP_SR_FAILED:
                case ICMP_PORT_UNREACH:
                        /* Impossible event. */
                        return;
                case ICMP_FRAG_NEEDED:
                        /* Soft state for pmtu is maintained by IP core. */
                        return;
                default:
                        /* All others are translated to HOST_UNREACH.
                           rfc2003 contains "deep thoughts" about NET_UNREACH,
                           I believe they are just ether pollution. --ANK
                         */
                        break;
                }
                break;
        case ICMP_TIME_EXCEEDED:
                if (code != ICMP_EXC_TTL)
                        return;
                break;
        }

        rcu_read_lock();
        /* The key, when present, is the last 32-bit word before grehlen. */
        t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
                                flags & GRE_KEY ?
                                *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
                                p[1]);
        /* Ignore errors for NBMA (daddr==0) and multicast tunnels. */
        if (t == NULL || t->parms.iph.daddr == 0 ||
            ipv4_is_multicast(t->parms.iph.daddr))
                goto out;

        /* With inherited TTL, TTL-exceeded is expected (e.g. traceroute). */
        if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
                goto out;

        if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
                t->err_count++;
        else
                t->err_count = 1;
        t->err_time = jiffies;
out:
        rcu_read_unlock();
}
507
508 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
509 {
510         if (INET_ECN_is_ce(iph->tos)) {
511                 if (skb->protocol == htons(ETH_P_IP)) {
512                         IP_ECN_set_ce(ip_hdr(skb));
513                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
514                         IP6_ECN_set_ce(ipv6_hdr(skb));
515                 }
516         }
517 }
518
519 static inline u8
520 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
521 {
522         u8 inner = 0;
523         if (skb->protocol == htons(ETH_P_IP))
524                 inner = old_iph->tos;
525         else if (skb->protocol == htons(ETH_P_IPV6))
526                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
527         return INET_ECN_encapsulate(tos, inner);
528 }
529
/*
 * ipgre_rcv - receive handler for GRE-over-IPv4 packets.
 *
 * Parses the GRE header (flags, optional checksum/key/sequence), looks
 * up the matching tunnel, validates checksum and sequence requirements,
 * strips the encapsulation and re-injects the inner packet via netif_rx().
 * Unmatched packets trigger an ICMP port-unreachable; invalid ones are
 * dropped.  Always returns 0.
 */
static int ipgre_rcv(struct sk_buff *skb)
{
        struct iphdr *iph;
        u8     *h;
        __be16    flags;
        __sum16   csum = 0;
        __be32 key = 0;
        u32    seqno = 0;
        struct ip_tunnel *tunnel;
        int    offset = 4;      /* base GRE header: flags + protocol */
        __be16 gre_proto;

        /* Need outer IP header (linear) plus the largest GRE header we parse. */
        if (!pskb_may_pull(skb, 16))
                goto drop_nolock;

        iph = ip_hdr(skb);
        h = skb->data;
        flags = *(__be16*)h;

        if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
                /* - Version must be 0.
                   - We do not support routing headers.
                 */
                if (flags&(GRE_VERSION|GRE_ROUTING))
                        goto drop_nolock;

                if (flags&GRE_CSUM) {
                        switch (skb->ip_summed) {
                        case CHECKSUM_COMPLETE:
                                csum = csum_fold(skb->csum);
                                if (!csum)
                                        break;
                                /* fall through */
                        case CHECKSUM_NONE:
                                /* Verify in software; csum != 0 means bad. */
                                skb->csum = 0;
                                csum = __skb_checksum_complete(skb);
                                skb->ip_summed = CHECKSUM_COMPLETE;
                        }
                        offset += 4;
                }
                if (flags&GRE_KEY) {
                        key = *(__be32*)(h + offset);
                        offset += 4;
                }
                if (flags&GRE_SEQ) {
                        seqno = ntohl(*(__be32*)(h + offset));
                        offset += 4;
                }
        }

        gre_proto = *(__be16 *)(h + 2);

        rcu_read_lock();
        if ((tunnel = ipgre_tunnel_lookup(skb->dev,
                                          iph->saddr, iph->daddr, key,
                                          gre_proto))) {
                struct net_device_stats *stats = &tunnel->dev->stats;

                secpath_reset(skb);

                skb->protocol = gre_proto;
                /* WCCP version 1 and 2 protocol decoding.
                 * - Change protocol to IP
                 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
                 */
                if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
                        skb->protocol = htons(ETH_P_IP);
                        /* Not an IPv4 header at the payload: WCCPv2 redirect
                         * header present, skip it. */
                        if ((*(h + offset) & 0xF0) != 0x40)
                                offset += 4;
                }

                skb->mac_header = skb->network_header;
                __pskb_pull(skb, offset);
                skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
                skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
                if (ipv4_is_multicast(iph->daddr)) {
                        /* Looped back packet, drop it! */
                        if (skb_rtable(skb)->fl.iif == 0)
                                goto drop;
                        stats->multicast++;
                        skb->pkt_type = PACKET_BROADCAST;
                }
#endif

                /* Drop on bad checksum, or when the tunnel requires a
                 * checksum and the packet carries none. */
                if (((flags&GRE_CSUM) && csum) ||
                    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
                        stats->rx_crc_errors++;
                        stats->rx_errors++;
                        goto drop;
                }
                /* Enforce strictly increasing sequence numbers if configured. */
                if (tunnel->parms.i_flags&GRE_SEQ) {
                        if (!(flags&GRE_SEQ) ||
                            (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
                                stats->rx_fifo_errors++;
                                stats->rx_errors++;
                                goto drop;
                        }
                        tunnel->i_seqno = seqno + 1;
                }

                /* Warning: All skb pointers will be invalidated! */
                if (tunnel->dev->type == ARPHRD_ETHER) {
                        if (!pskb_may_pull(skb, ETH_HLEN)) {
                                stats->rx_length_errors++;
                                stats->rx_errors++;
                                goto drop;
                        }

                        /* Re-read headers after the pull above. */
                        iph = ip_hdr(skb);
                        skb->protocol = eth_type_trans(skb, tunnel->dev);
                        skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
                }

                skb_tunnel_rx(skb, tunnel->dev);

                skb_reset_network_header(skb);
                ipgre_ecn_decapsulate(iph, skb);

                netif_rx(skb);
                rcu_read_unlock();
                return(0);
        }
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
        rcu_read_unlock();
drop_nolock:
        kfree_skb(skb);
        return(0);
}
661
662 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
663 {
664         struct ip_tunnel *tunnel = netdev_priv(dev);
665         struct net_device_stats *stats = &dev->stats;
666         struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
667         struct iphdr  *old_iph = ip_hdr(skb);
668         struct iphdr  *tiph;
669         u8     tos;
670         __be16 df;
671         struct rtable *rt;                      /* Route to the other host */
672         struct net_device *tdev;                        /* Device to other host */
673         struct iphdr  *iph;                     /* Our new IP header */
674         unsigned int max_headroom;              /* The extra header space needed */
675         int    gre_hlen;
676         __be32 dst;
677         int    mtu;
678
679         if (dev->type == ARPHRD_ETHER)
680                 IPCB(skb)->flags = 0;
681
682         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
683                 gre_hlen = 0;
684                 tiph = (struct iphdr *)skb->data;
685         } else {
686                 gre_hlen = tunnel->hlen;
687                 tiph = &tunnel->parms.iph;
688         }
689
690         if ((dst = tiph->daddr) == 0) {
691                 /* NBMA tunnel */
692
693                 if (skb_dst(skb) == NULL) {
694                         stats->tx_fifo_errors++;
695                         goto tx_error;
696                 }
697
698                 if (skb->protocol == htons(ETH_P_IP)) {
699                         rt = skb_rtable(skb);
700                         if ((dst = rt->rt_gateway) == 0)
701                                 goto tx_error_icmp;
702                 }
703 #ifdef CONFIG_IPV6
704                 else if (skb->protocol == htons(ETH_P_IPV6)) {
705                         struct in6_addr *addr6;
706                         int addr_type;
707                         struct neighbour *neigh = skb_dst(skb)->neighbour;
708
709                         if (neigh == NULL)
710                                 goto tx_error;
711
712                         addr6 = (struct in6_addr *)&neigh->primary_key;
713                         addr_type = ipv6_addr_type(addr6);
714
715                         if (addr_type == IPV6_ADDR_ANY) {
716                                 addr6 = &ipv6_hdr(skb)->daddr;
717                                 addr_type = ipv6_addr_type(addr6);
718                         }
719
720                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
721                                 goto tx_error_icmp;
722
723                         dst = addr6->s6_addr32[3];
724                 }
725 #endif
726                 else
727                         goto tx_error;
728         }
729
730         tos = tiph->tos;
731         if (tos == 1) {
732                 tos = 0;
733                 if (skb->protocol == htons(ETH_P_IP))
734                         tos = old_iph->tos;
735                 else if (skb->protocol == htons(ETH_P_IPV6))
736                         tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
737         }
738
739         {
740                 struct flowi fl = { .oif = tunnel->parms.link,
741                                     .nl_u = { .ip4_u =
742                                               { .daddr = dst,
743                                                 .saddr = tiph->saddr,
744                                                 .tos = RT_TOS(tos) } },
745                                     .proto = IPPROTO_GRE };
746                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
747                         stats->tx_carrier_errors++;
748                         goto tx_error;
749                 }
750         }
751         tdev = rt->dst.dev;
752
753         if (tdev == dev) {
754                 ip_rt_put(rt);
755                 stats->collisions++;
756                 goto tx_error;
757         }
758
759         df = tiph->frag_off;
760         if (df)
761                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
762         else
763                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
764
765         if (skb_dst(skb))
766                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
767
768         if (skb->protocol == htons(ETH_P_IP)) {
769                 df |= (old_iph->frag_off&htons(IP_DF));
770
771                 if ((old_iph->frag_off&htons(IP_DF)) &&
772                     mtu < ntohs(old_iph->tot_len)) {
773                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
774                         ip_rt_put(rt);
775                         goto tx_error;
776                 }
777         }
778 #ifdef CONFIG_IPV6
779         else if (skb->protocol == htons(ETH_P_IPV6)) {
780                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
781
782                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
783                         if ((tunnel->parms.iph.daddr &&
784                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
785                             rt6->rt6i_dst.plen == 128) {
786                                 rt6->rt6i_flags |= RTF_MODIFIED;
787                                 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
788                         }
789                 }
790
791                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
792                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
793                         ip_rt_put(rt);
794                         goto tx_error;
795                 }
796         }
797 #endif
798
799         if (tunnel->err_count > 0) {
800                 if (time_before(jiffies,
801                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
802                         tunnel->err_count--;
803
804                         dst_link_failure(skb);
805                 } else
806                         tunnel->err_count = 0;
807         }
808
809         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
810
811         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
812             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
813                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
814                 if (max_headroom > dev->needed_headroom)
815                         dev->needed_headroom = max_headroom;
816                 if (!new_skb) {
817                         ip_rt_put(rt);
818                         txq->tx_dropped++;
819                         dev_kfree_skb(skb);
820                         return NETDEV_TX_OK;
821                 }
822                 if (skb->sk)
823                         skb_set_owner_w(new_skb, skb->sk);
824                 dev_kfree_skb(skb);
825                 skb = new_skb;
826                 old_iph = ip_hdr(skb);
827         }
828
829         skb_reset_transport_header(skb);
830         skb_push(skb, gre_hlen);
831         skb_reset_network_header(skb);
832         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
833         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
834                               IPSKB_REROUTED);
835         skb_dst_drop(skb);
836         skb_dst_set(skb, &rt->dst);
837
838         /*
839          *      Push down and install the IPIP header.
840          */
841
842         iph                     =       ip_hdr(skb);
843         iph->version            =       4;
844         iph->ihl                =       sizeof(struct iphdr) >> 2;
845         iph->frag_off           =       df;
846         iph->protocol           =       IPPROTO_GRE;
847         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
848         iph->daddr              =       rt->rt_dst;
849         iph->saddr              =       rt->rt_src;
850
851         if ((iph->ttl = tiph->ttl) == 0) {
852                 if (skb->protocol == htons(ETH_P_IP))
853                         iph->ttl = old_iph->ttl;
854 #ifdef CONFIG_IPV6
855                 else if (skb->protocol == htons(ETH_P_IPV6))
856                         iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
857 #endif
858                 else
859                         iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT);
860         }
861
862         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
863         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
864                                    htons(ETH_P_TEB) : skb->protocol;
865
866         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
867                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
868
869                 if (tunnel->parms.o_flags&GRE_SEQ) {
870                         ++tunnel->o_seqno;
871                         *ptr = htonl(tunnel->o_seqno);
872                         ptr--;
873                 }
874                 if (tunnel->parms.o_flags&GRE_KEY) {
875                         *ptr = tunnel->parms.o_key;
876                         ptr--;
877                 }
878                 if (tunnel->parms.o_flags&GRE_CSUM) {
879                         *ptr = 0;
880                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
881                 }
882         }
883
884         nf_reset(skb);
885
886         IPTUNNEL_XMIT();
887         return NETDEV_TX_OK;
888
889 tx_error_icmp:
890         dst_link_failure(skb);
891
892 tx_error:
893         stats->tx_errors++;
894         dev_kfree_skb(skb);
895         return NETDEV_TX_OK;
896 }
897
/*
 * ipgre_tunnel_bind_dev - bind a tunnel to its underlying device and size it
 * @dev: the GRE tunnel device
 *
 * Guesses the output device (via a route lookup when a peer address is
 * configured, falling back to the explicitly configured link) and from it
 * derives dev->needed_headroom, the cached encapsulation header length
 * (tunnel->hlen) and a suggested MTU.
 *
 * Returns the suggested MTU for @dev, clamped to at least 68 (the minimum
 * IPv4 MTU).  The caller decides whether to assign it to dev->mtu.
 */
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
        struct net_device *tdev = NULL;
        struct ip_tunnel *tunnel;
        struct iphdr *iph;
        int hlen = LL_MAX_HEADER;               /* pessimistic default when no underlay is known */
        int mtu = ETH_DATA_LEN;
        int addend = sizeof(struct iphdr) + 4;  /* outer IP header + 4-byte base GRE header */

        tunnel = netdev_priv(dev);
        iph = &tunnel->parms.iph;

        /* Guess output device to choose reasonable mtu and needed_headroom */

        if (iph->daddr) {
                struct flowi fl = { .oif = tunnel->parms.link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = iph->daddr,
                                                .saddr = iph->saddr,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_GRE };
                struct rtable *rt;
                if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
                        tdev = rt->dst.dev;
                        ip_rt_put(rt);
                }

                /* gretap devices stay broadcast-Ethernet-like; plain GRE
                 * with a fixed peer is point-to-point. */
                if (dev->type != ARPHRD_ETHER)
                        dev->flags |= IFF_POINTOPOINT;
        }

        if (!tdev && tunnel->parms.link)
                tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

        if (tdev) {
                hlen = tdev->hard_header_len + tdev->needed_headroom;
                mtu = tdev->mtu;
        }
        dev->iflink = tunnel->parms.link;

        /* Precalculate GRE options length */
        if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
                if (tunnel->parms.o_flags&GRE_CSUM)
                        addend += 4;
                if (tunnel->parms.o_flags&GRE_KEY)
                        addend += 4;
                if (tunnel->parms.o_flags&GRE_SEQ)
                        addend += 4;
        }
        dev->needed_headroom = addend + hlen;
        mtu -= dev->hard_header_len + addend;

        /* Never advertise less than the minimum IPv4 MTU. */
        if (mtu < 68)
                mtu = 68;

        tunnel->hlen = addend;

        return mtu;
}
957
/*
 * ipgre_tunnel_ioctl - SIOC{GET,ADD,CHG,DEL}TUNNEL handler (ndo_do_ioctl)
 *
 * The user passes a struct ip_tunnel_parm through ifr->ifr_ifru.ifru_data.
 * Operations on the per-namespace fallback device ("gre0") address tunnels
 * by parameters; on any other GRE device they act on that device itself.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
        int err = 0;
        struct ip_tunnel_parm p;
        struct ip_tunnel *t;
        struct net *net = dev_net(dev);
        struct ipgre_net *ign = net_generic(net, ipgre_net_id);

        switch (cmd) {
        case SIOCGETTUNNEL:
                /* On the fallback device look up the tunnel named by the
                 * user's parameter block; otherwise report this device. */
                t = NULL;
                if (dev == ign->fb_tunnel_dev) {
                        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
                                err = -EFAULT;
                                break;
                        }
                        t = ipgre_tunnel_locate(net, &p, 0);
                }
                if (t == NULL)
                        t = netdev_priv(dev);
                memcpy(&p, &t->parms, sizeof(p));
                if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
                        err = -EFAULT;
                break;

        case SIOCADDTUNNEL:
        case SIOCCHGTUNNEL:
                err = -EPERM;
                if (!capable(CAP_NET_ADMIN))
                        goto done;

                err = -EFAULT;
                if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
                        goto done;

                /* Outer header must be plain IPv4/GRE with no IP options,
                 * no fragment fields beyond DF, and no GRE routing/version
                 * bits (those GRE variants are not implemented). */
                err = -EINVAL;
                if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
                    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
                    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
                        goto done;
                /* A fixed TTL implies path-MTU discovery (DF set). */
                if (p.iph.ttl)
                        p.iph.frag_off |= htons(IP_DF);

                /* Keys are only meaningful when the KEY flag is set. */
                if (!(p.i_flags&GRE_KEY))
                        p.i_key = 0;
                if (!(p.o_flags&GRE_KEY))
                        p.o_key = 0;

                t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

                if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
                        if (t != NULL) {
                                /* Parameters already belong to another tunnel. */
                                if (t->dev != dev) {
                                        err = -EEXIST;
                                        break;
                                }
                        } else {
                                unsigned nflags = 0;

                                t = netdev_priv(dev);

                                if (ipv4_is_multicast(p.iph.daddr))
                                        nflags = IFF_BROADCAST;
                                else if (p.iph.daddr)
                                        nflags = IFF_POINTOPOINT;

                                /* Refuse changes that would flip the device
                                 * between broadcast/point-to-point modes. */
                                if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
                                        err = -EINVAL;
                                        break;
                                }
                                /* Re-hash the tunnel under its new
                                 * addresses and keys. */
                                ipgre_tunnel_unlink(ign, t);
                                t->parms.iph.saddr = p.iph.saddr;
                                t->parms.iph.daddr = p.iph.daddr;
                                t->parms.i_key = p.i_key;
                                t->parms.o_key = p.o_key;
                                memcpy(dev->dev_addr, &p.iph.saddr, 4);
                                memcpy(dev->broadcast, &p.iph.daddr, 4);
                                ipgre_tunnel_link(ign, t);
                                netdev_state_change(dev);
                        }
                }

                if (t) {
                        err = 0;
                        if (cmd == SIOCCHGTUNNEL) {
                                t->parms.iph.ttl = p.iph.ttl;
                                t->parms.iph.tos = p.iph.tos;
                                t->parms.iph.frag_off = p.iph.frag_off;
                                /* A new underlay link changes headroom/MTU. */
                                if (t->parms.link != p.link) {
                                        t->parms.link = p.link;
                                        dev->mtu = ipgre_tunnel_bind_dev(dev);
                                        netdev_state_change(dev);
                                }
                        }
                        /* Hand the (possibly updated) parameters back. */
                        if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
                                err = -EFAULT;
                } else
                        err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
                break;

        case SIOCDELTUNNEL:
                err = -EPERM;
                if (!capable(CAP_NET_ADMIN))
                        goto done;

                if (dev == ign->fb_tunnel_dev) {
                        /* Deleting via the fallback device: resolve the target
                         * by parameters; the fallback itself is protected. */
                        err = -EFAULT;
                        if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
                                goto done;
                        err = -ENOENT;
                        if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
                                goto done;
                        err = -EPERM;
                        if (t == netdev_priv(ign->fb_tunnel_dev))
                                goto done;
                        dev = t->dev;
                }
                unregister_netdevice(dev);
                err = 0;
                break;

        default:
                err = -EINVAL;
        }

done:
        return err;
}
1087
1088 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1089 {
1090         struct ip_tunnel *tunnel = netdev_priv(dev);
1091         if (new_mtu < 68 ||
1092             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1093                 return -EINVAL;
1094         dev->mtu = new_mtu;
1095         return 0;
1096 }
1097
1098 /* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.
1101
1102
   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could have made something similar,
   but this feature is apparently missing in IOS<=11.2(8).
1107
1108    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1109    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1110
1111    ping -t 255 224.66.66.66
1112
1113    If nobody answers, mbone does not work.
1114
1115    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1116    ip addr add 10.66.66.<somewhat>/24 dev Universe
1117    ifconfig Universe up
1118    ifconfig Universe add fe80::<Your_real_addr>/10
1119    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1120    ftp 10.66.66.66
1121    ...
1122    ftp fec0:6666:6666::193.233.7.65
1123    ...
1124
1125  */
1126
/*
 * ipgre_header - build the outer IP + GRE header (header_ops->create)
 * @skb:   buffer to prepend the header to
 * @dev:   tunnel device
 * @type:  inner protocol id, stored in the GRE protocol field
 * @daddr: optional outer IPv4 destination (4 bytes), overrides template
 * @saddr: optional outer IPv4 source (4 bytes), overrides template
 * @len:   unused here
 *
 * Copies the tunnel's template IP header and writes the GRE flags and
 * protocol words right behind it.  Returns t->hlen when the destination
 * is known; otherwise -t->hlen, which by the header_ops convention
 * signals a partially-filled header.
 */
static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
                        unsigned short type,
                        const void *daddr, const void *saddr, unsigned len)
{
        struct ip_tunnel *t = netdev_priv(dev);
        struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
        __be16 *p = (__be16*)(iph+1);   /* GRE header starts right after the IP header */

        memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
        p[0]            = t->parms.o_flags;
        p[1]            = htons(type);

        /*
         *      Set the source hardware address.
         */

        if (saddr)
                memcpy(&iph->saddr, saddr, 4);
        if (daddr)
                memcpy(&iph->daddr, daddr, 4);
        if (iph->daddr)
                return t->hlen;

        return -t->hlen;
}
1152
1153 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1154 {
1155         struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1156         memcpy(haddr, &iph->saddr, 4);
1157         return 4;
1158 }
1159
/* Link-layer header operations for addressed (NBMA/broadcast) GRE devices:
 * the outer IPv4 addresses play the role of hardware addresses. */
static const struct header_ops ipgre_header_ops = {
        .create = ipgre_header,
        .parse  = ipgre_header_parse,
};
1164
1165 #ifdef CONFIG_NET_IPGRE_BROADCAST
/*
 * ipgre_open - ndo_open for broadcast-capable GRE tunnels
 *
 * When the configured peer is an IPv4 multicast group, join that group
 * on the device the route to it points at, and remember that device's
 * ifindex in t->mlink so ipgre_close() can leave the group again.
 * Returns -EADDRNOTAVAIL if no route exists or the chosen device has no
 * IPv4 configuration.
 */
static int ipgre_open(struct net_device *dev)
{
        struct ip_tunnel *t = netdev_priv(dev);

        if (ipv4_is_multicast(t->parms.iph.daddr)) {
                struct flowi fl = { .oif = t->parms.link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = t->parms.iph.daddr,
                                                .saddr = t->parms.iph.saddr,
                                                .tos = RT_TOS(t->parms.iph.tos) } },
                                    .proto = IPPROTO_GRE };
                struct rtable *rt;
                if (ip_route_output_key(dev_net(dev), &rt, &fl))
                        return -EADDRNOTAVAIL;
                /* From here on, "dev" is the underlying output device,
                 * not the tunnel device we were called on. */
                dev = rt->dst.dev;
                ip_rt_put(rt);
                if (__in_dev_get_rtnl(dev) == NULL)
                        return -EADDRNOTAVAIL;
                t->mlink = dev->ifindex;
                ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
        }
        return 0;
}
1189
1190 static int ipgre_close(struct net_device *dev)
1191 {
1192         struct ip_tunnel *t = netdev_priv(dev);
1193
1194         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1195                 struct in_device *in_dev;
1196                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1197                 if (in_dev) {
1198                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1199                         in_dev_put(in_dev);
1200                 }
1201         }
1202         return 0;
1203 }
1204
1205 #endif
1206
/* Device operations for plain (layer-3) GRE tunnel devices; open/stop are
 * only needed for the multicast "broadcast LAN" mode. */
static const struct net_device_ops ipgre_netdev_ops = {
        .ndo_init               = ipgre_tunnel_init,
        .ndo_uninit             = ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
        .ndo_open               = ipgre_open,
        .ndo_stop               = ipgre_close,
#endif
        .ndo_start_xmit         = ipgre_tunnel_xmit,
        .ndo_do_ioctl           = ipgre_tunnel_ioctl,
        .ndo_change_mtu         = ipgre_tunnel_change_mtu,
};
1218
1219 static void ipgre_tunnel_setup(struct net_device *dev)
1220 {
1221         dev->netdev_ops         = &ipgre_netdev_ops;
1222         dev->destructor         = free_netdev;
1223
1224         dev->type               = ARPHRD_IPGRE;
1225         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1226         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1227         dev->flags              = IFF_NOARP;
1228         dev->iflink             = 0;
1229         dev->addr_len           = 4;
1230         dev->features           |= NETIF_F_NETNS_LOCAL;
1231         dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1232 }
1233
/*
 * ipgre_tunnel_init - ndo_init for layer-3 GRE devices
 *
 * Publishes the tunnel endpoints as the device's dev_addr/broadcast and
 * installs header_ops where a link-layer header makes sense: always for
 * NBMA tunnels (no fixed peer), and for multicast "broadcast LAN"
 * tunnels when that mode is compiled in.  A multicast peer requires a
 * fixed local address (-EINVAL otherwise).
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
        struct ip_tunnel *tunnel;
        struct iphdr *iph;

        tunnel = netdev_priv(dev);
        iph = &tunnel->parms.iph;

        tunnel->dev = dev;
        strcpy(tunnel->parms.name, dev->name);

        memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
        memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

        if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
                if (ipv4_is_multicast(iph->daddr)) {
                        /* Multicast tunnels must have a fixed source. */
                        if (!iph->saddr)
                                return -EINVAL;
                        dev->flags = IFF_BROADCAST;
                        dev->header_ops = &ipgre_header_ops;
                }
#endif
        } else
                /* NBMA mode: destination supplied per-packet via header_ops. */
                dev->header_ops = &ipgre_header_ops;

        return 0;
}
1262
1263 static void ipgre_fb_tunnel_init(struct net_device *dev)
1264 {
1265         struct ip_tunnel *tunnel = netdev_priv(dev);
1266         struct iphdr *iph = &tunnel->parms.iph;
1267         struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1268
1269         tunnel->dev = dev;
1270         strcpy(tunnel->parms.name, dev->name);
1271
1272         iph->version            = 4;
1273         iph->protocol           = IPPROTO_GRE;
1274         iph->ihl                = 5;
1275         tunnel->hlen            = sizeof(struct iphdr) + 4;
1276
1277         dev_hold(dev);
1278         ign->tunnels_wc[0]      = tunnel;
1279 }
1280
1281
/* GRE demux hooks: receive and ICMP-error handlers, registered for the
 * GREPROTO_CISCO slot in ipgre_init(). */
static const struct gre_protocol ipgre_protocol = {
        .handler     = ipgre_rcv,
        .err_handler = ipgre_err,
};
1286
1287 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1288 {
1289         int prio;
1290
1291         for (prio = 0; prio < 4; prio++) {
1292                 int h;
1293                 for (h = 0; h < HASH_SIZE; h++) {
1294                         struct ip_tunnel *t = ign->tunnels[prio][h];
1295
1296                         while (t != NULL) {
1297                                 unregister_netdevice_queue(t->dev, head);
1298                                 t = t->next;
1299                         }
1300                 }
1301         }
1302 }
1303
/*
 * ipgre_init_net - per-namespace init: create the fallback "gre0" device
 *
 * Allocates and registers the namespace's catch-all GRE device.  On the
 * pre-registration error path the device must be freed manually with
 * free_netdev(); once registered, the destructor set in
 * ipgre_tunnel_setup() takes over.
 */
static int __net_init ipgre_init_net(struct net *net)
{
        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
        int err;

        ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
                                           ipgre_tunnel_setup);
        if (!ign->fb_tunnel_dev) {
                err = -ENOMEM;
                goto err_alloc_dev;
        }
        dev_net_set(ign->fb_tunnel_dev, net);

        ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
        ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;

        if ((err = register_netdev(ign->fb_tunnel_dev)))
                goto err_reg_dev;

        return 0;

err_reg_dev:
        free_netdev(ign->fb_tunnel_dev);
err_alloc_dev:
        return err;
}
1330
1331 static void __net_exit ipgre_exit_net(struct net *net)
1332 {
1333         struct ipgre_net *ign;
1334         LIST_HEAD(list);
1335
1336         ign = net_generic(net, ipgre_net_id);
1337         rtnl_lock();
1338         ipgre_destroy_tunnels(ign, &list);
1339         unregister_netdevice_many(&list);
1340         rtnl_unlock();
1341 }
1342
/* Per-network-namespace lifecycle: each namespace gets its own
 * struct ipgre_net (hash tables + fallback device). */
static struct pernet_operations ipgre_net_ops = {
        .init = ipgre_init_net,
        .exit = ipgre_exit_net,
        .id   = &ipgre_net_id,
        .size = sizeof(struct ipgre_net),
};
1349
1350 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1351 {
1352         __be16 flags;
1353
1354         if (!data)
1355                 return 0;
1356
1357         flags = 0;
1358         if (data[IFLA_GRE_IFLAGS])
1359                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1360         if (data[IFLA_GRE_OFLAGS])
1361                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1362         if (flags & (GRE_VERSION|GRE_ROUTING))
1363                 return -EINVAL;
1364
1365         return 0;
1366 }
1367
1368 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1369 {
1370         __be32 daddr;
1371
1372         if (tb[IFLA_ADDRESS]) {
1373                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1374                         return -EINVAL;
1375                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1376                         return -EADDRNOTAVAIL;
1377         }
1378
1379         if (!data)
1380                 goto out;
1381
1382         if (data[IFLA_GRE_REMOTE]) {
1383                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1384                 if (!daddr)
1385                         return -EINVAL;
1386         }
1387
1388 out:
1389         return ipgre_tunnel_validate(tb, data);
1390 }
1391
1392 static void ipgre_netlink_parms(struct nlattr *data[],
1393                                 struct ip_tunnel_parm *parms)
1394 {
1395         memset(parms, 0, sizeof(*parms));
1396
1397         parms->iph.protocol = IPPROTO_GRE;
1398
1399         if (!data)
1400                 return;
1401
1402         if (data[IFLA_GRE_LINK])
1403                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1404
1405         if (data[IFLA_GRE_IFLAGS])
1406                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1407
1408         if (data[IFLA_GRE_OFLAGS])
1409                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1410
1411         if (data[IFLA_GRE_IKEY])
1412                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1413
1414         if (data[IFLA_GRE_OKEY])
1415                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1416
1417         if (data[IFLA_GRE_LOCAL])
1418                 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1419
1420         if (data[IFLA_GRE_REMOTE])
1421                 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1422
1423         if (data[IFLA_GRE_TTL])
1424                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1425
1426         if (data[IFLA_GRE_TOS])
1427                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1428
1429         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1430                 parms->iph.frag_off = htons(IP_DF);
1431 }
1432
1433 static int ipgre_tap_init(struct net_device *dev)
1434 {
1435         struct ip_tunnel *tunnel;
1436
1437         tunnel = netdev_priv(dev);
1438
1439         tunnel->dev = dev;
1440         strcpy(tunnel->parms.name, dev->name);
1441
1442         ipgre_tunnel_bind_dev(dev);
1443
1444         return 0;
1445 }
1446
/* Device operations for gretap (Ethernet-over-GRE) devices: Ethernet
 * address handling instead of the tunnel ioctl/broadcast hooks. */
static const struct net_device_ops ipgre_tap_netdev_ops = {
        .ndo_init               = ipgre_tap_init,
        .ndo_uninit             = ipgre_tunnel_uninit,
        .ndo_start_xmit         = ipgre_tunnel_xmit,
        .ndo_set_mac_address    = eth_mac_addr,
        .ndo_validate_addr      = eth_validate_addr,
        .ndo_change_mtu         = ipgre_tunnel_change_mtu,
};
1455
1456 static void ipgre_tap_setup(struct net_device *dev)
1457 {
1458
1459         ether_setup(dev);
1460
1461         dev->netdev_ops         = &ipgre_tap_netdev_ops;
1462         dev->destructor         = free_netdev;
1463
1464         dev->iflink             = 0;
1465         dev->features           |= NETIF_F_NETNS_LOCAL;
1466 }
1467
/*
 * ipgre_newlink - rtnl_link_ops->newlink for "gre" and "gretap"
 *
 * Fills the private tunnel parameters from the netlink attributes,
 * refuses duplicates, gives gretap devices a random MAC when none was
 * supplied, sizes the MTU from the underlay (unless IFLA_MTU was given),
 * then registers the device and links it into the tunnel hash.
 */
static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
                         struct nlattr *data[])
{
        struct ip_tunnel *nt;
        struct net *net = dev_net(dev);
        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
        int mtu;
        int err;

        nt = netdev_priv(dev);
        ipgre_netlink_parms(data, &nt->parms);

        /* Refuse a second tunnel with identical parameters. */
        if (ipgre_tunnel_find(net, &nt->parms, dev->type))
                return -EEXIST;

        if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
                random_ether_addr(dev->dev_addr);

        mtu = ipgre_tunnel_bind_dev(dev);
        if (!tb[IFLA_MTU])
                dev->mtu = mtu;

        err = register_netdevice(dev);
        if (err)
                goto out;

        /* Hold a reference for the hash-table link. */
        dev_hold(dev);
        ipgre_tunnel_link(ign, nt);

out:
        return err;
}
1500
/*
 * ipgre_changelink - rtnl_link_ops->changelink for "gre" and "gretap"
 *
 * Parses the new parameters and, when they name a different existing
 * tunnel, fails with -EEXIST.  Otherwise re-hashes this tunnel under the
 * new endpoint addresses/input key and updates the remaining fields in
 * place.  The fallback device cannot be reconfigured.  Note that
 * i_flags/o_flags are not changed here (only the keys, addresses, ttl,
 * tos, frag_off and link are).
 */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
                            struct nlattr *data[])
{
        struct ip_tunnel *t, *nt;
        struct net *net = dev_net(dev);
        struct ipgre_net *ign = net_generic(net, ipgre_net_id);
        struct ip_tunnel_parm p;
        int mtu;

        if (dev == ign->fb_tunnel_dev)
                return -EINVAL;

        nt = netdev_priv(dev);
        ipgre_netlink_parms(data, &p);

        t = ipgre_tunnel_locate(net, &p, 0);

        if (t) {
                /* Parameters already belong to another tunnel. */
                if (t->dev != dev)
                        return -EEXIST;
        } else {
                t = nt;

                if (dev->type != ARPHRD_ETHER) {
                        unsigned nflags = 0;

                        if (ipv4_is_multicast(p.iph.daddr))
                                nflags = IFF_BROADCAST;
                        else if (p.iph.daddr)
                                nflags = IFF_POINTOPOINT;

                        /* Refuse changes that would flip the device between
                         * broadcast/point-to-point modes. */
                        if ((dev->flags ^ nflags) &
                            (IFF_POINTOPOINT | IFF_BROADCAST))
                                return -EINVAL;
                }

                /* Re-hash under the new addresses and input key. */
                ipgre_tunnel_unlink(ign, t);
                t->parms.iph.saddr = p.iph.saddr;
                t->parms.iph.daddr = p.iph.daddr;
                t->parms.i_key = p.i_key;
                if (dev->type != ARPHRD_ETHER) {
                        memcpy(dev->dev_addr, &p.iph.saddr, 4);
                        memcpy(dev->broadcast, &p.iph.daddr, 4);
                }
                ipgre_tunnel_link(ign, t);
                netdev_state_change(dev);
        }

        t->parms.o_key = p.o_key;
        t->parms.iph.ttl = p.iph.ttl;
        t->parms.iph.tos = p.iph.tos;
        t->parms.iph.frag_off = p.iph.frag_off;

        /* A new underlay link changes headroom and MTU. */
        if (t->parms.link != p.link) {
                t->parms.link = p.link;
                mtu = ipgre_tunnel_bind_dev(dev);
                if (!tb[IFLA_MTU])
                        dev->mtu = mtu;
                netdev_state_change(dev);
        }

        return 0;
}
1564
1565 static size_t ipgre_get_size(const struct net_device *dev)
1566 {
1567         return
1568                 /* IFLA_GRE_LINK */
1569                 nla_total_size(4) +
1570                 /* IFLA_GRE_IFLAGS */
1571                 nla_total_size(2) +
1572                 /* IFLA_GRE_OFLAGS */
1573                 nla_total_size(2) +
1574                 /* IFLA_GRE_IKEY */
1575                 nla_total_size(4) +
1576                 /* IFLA_GRE_OKEY */
1577                 nla_total_size(4) +
1578                 /* IFLA_GRE_LOCAL */
1579                 nla_total_size(4) +
1580                 /* IFLA_GRE_REMOTE */
1581                 nla_total_size(4) +
1582                 /* IFLA_GRE_TTL */
1583                 nla_total_size(1) +
1584                 /* IFLA_GRE_TOS */
1585                 nla_total_size(1) +
1586                 /* IFLA_GRE_PMTUDISC */
1587                 nla_total_size(1) +
1588                 0;
1589 }
1590
/*
 * ipgre_fill_info - dump the tunnel configuration to a netlink message
 *
 * Each NLA_PUT* macro jumps to the nla_put_failure label when the skb
 * runs out of tailroom, so -EMSGSIZE is returned on any failed put.
 */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
        struct ip_tunnel *t = netdev_priv(dev);
        struct ip_tunnel_parm *p = &t->parms;

        NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
        NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
        NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
        NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
        NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
        NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
        NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
        NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
        NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
        /* PMTUDISC is reported as a boolean derived from the DF bit. */
        NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

        return 0;

nla_put_failure:
        return -EMSGSIZE;
}
1612
/* Netlink attribute validation policy for IFLA_GRE_* attributes. */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
        [IFLA_GRE_LINK]         = { .type = NLA_U32 },
        [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
        [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
        [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
        [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
        [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
        [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
        [IFLA_GRE_TTL]          = { .type = NLA_U8 },
        [IFLA_GRE_TOS]          = { .type = NLA_U8 },
        [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
};
1625
/* rtnl link type "gre": layer-3 GRE tunnel devices. */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
        .kind           = "gre",
        .maxtype        = IFLA_GRE_MAX,
        .policy         = ipgre_policy,
        .priv_size      = sizeof(struct ip_tunnel),
        .setup          = ipgre_tunnel_setup,
        .validate       = ipgre_tunnel_validate,
        .newlink        = ipgre_newlink,
        .changelink     = ipgre_changelink,
        .get_size       = ipgre_get_size,
        .fill_info      = ipgre_fill_info,
};
1638
/* rtnl link type "gretap": Ethernet-over-GRE tap devices; shares the
 * netlink plumbing with "gre" but uses the Ethernet setup/validate. */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
        .kind           = "gretap",
        .maxtype        = IFLA_GRE_MAX,
        .policy         = ipgre_policy,
        .priv_size      = sizeof(struct ip_tunnel),
        .setup          = ipgre_tap_setup,
        .validate       = ipgre_tap_validate,
        .newlink        = ipgre_newlink,
        .changelink     = ipgre_changelink,
        .get_size       = ipgre_get_size,
        .fill_info      = ipgre_fill_info,
};
1651
1652 /*
1653  *      And now the modules code and kernel interface.
1654  */
1655
/*
 * ipgre_init - module init
 *
 * Registers, in order: the per-namespace state (which creates the
 * fallback gre0 device in every namespace), the GRE protocol demux for
 * the GREPROTO_CISCO slot, and the "gre"/"gretap" rtnl link types.
 * The goto chain unwinds the completed steps in reverse on failure.
 */
static int __init ipgre_init(void)
{
        int err;

        printk(KERN_INFO "GRE over IPv4 tunneling driver\n");

        err = register_pernet_device(&ipgre_net_ops);
        if (err < 0)
                return err;

        err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
        if (err < 0) {
                printk(KERN_INFO "ipgre init: can't add protocol\n");
                goto add_proto_failed;
        }

        err = rtnl_link_register(&ipgre_link_ops);
        if (err < 0)
                goto rtnl_link_failed;

        err = rtnl_link_register(&ipgre_tap_ops);
        if (err < 0)
                goto tap_ops_failed;

out:
        return err;

tap_ops_failed:
        rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
        gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
add_proto_failed:
        unregister_pernet_device(&ipgre_net_ops);
        goto out;
}
1691
/* Module exit: tear everything down in the reverse of ipgre_init()'s
 * registration order. */
static void __exit ipgre_fini(void)
{
        rtnl_link_unregister(&ipgre_tap_ops);
        rtnl_link_unregister(&ipgre_link_ops);
        if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
                printk(KERN_INFO "ipgre close: can't remove protocol\n");
        unregister_pernet_device(&ipgre_net_ops);
}
1700
/* Module hookup: GPL-licensed pernet driver providing the "gre" and
 * "gretap" rtnl link types (the aliases let "ip link add type gre"
 * autoload this module). */
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");