]> bbs.cooldavid.org Git - net-next-2.6.git/blob - net/ipv4/ip_gre.c
net: Support specifying the network namespace upon device creation.
[net-next-2.6.git] / net / ipv4 / ip_gre.c
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <asm/uaccess.h>
18 #include <linux/skbuff.h>
19 #include <linux/netdevice.h>
20 #include <linux/in.h>
21 #include <linux/tcp.h>
22 #include <linux/udp.h>
23 #include <linux/if_arp.h>
24 #include <linux/mroute.h>
25 #include <linux/init.h>
26 #include <linux/in6.h>
27 #include <linux/inetdevice.h>
28 #include <linux/igmp.h>
29 #include <linux/netfilter_ipv4.h>
30 #include <linux/etherdevice.h>
31 #include <linux/if_ether.h>
32
33 #include <net/sock.h>
34 #include <net/ip.h>
35 #include <net/icmp.h>
36 #include <net/protocol.h>
37 #include <net/ipip.h>
38 #include <net/arp.h>
39 #include <net/checksum.h>
40 #include <net/dsfield.h>
41 #include <net/inet_ecn.h>
42 #include <net/xfrm.h>
43 #include <net/net_namespace.h>
44 #include <net/netns/generic.h>
45 #include <net/rtnetlink.h>
46
47 #ifdef CONFIG_IPV6
48 #include <net/ipv6.h>
49 #include <net/ip6_fib.h>
50 #include <net/ip6_route.h>
51 #endif
52
53 /*
54    Problems & solutions
55    --------------------
56
57    1. The most important issue is detecting local dead loops.
58    They would cause complete host lockup in transmit, which
59    would be "resolved" by stack overflow or, if queueing is enabled,
60    with infinite looping in net_bh.
61
62    We cannot track such dead loops during route installation,
63    it is infeasible task. The most general solutions would be
64    to keep skb->encapsulation counter (sort of local ttl),
65    and silently drop packet when it expires. It is the best
66    solution, but it supposes maintaining a new variable in ALL
67    skb, even if no tunneling is used.
68
69    Current solution: HARD_TX_LOCK lock breaks dead loops.
70
71
72
73    2. Networking dead loops would not kill routers, but would really
74    kill network. IP hop limit plays role of "t->recursion" in this case,
75    if we copy it from packet being encapsulated to upper header.
76    It is very good solution, but it introduces two problems:
77
78    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
79      do not work over tunnels.
80    - traceroute does not work. I planned to relay ICMP from tunnel,
81      so that this problem would be solved and traceroute output
82      would be even more informative. This idea appeared to be wrong:
83      only Linux complies to rfc1812 now (yes, guys, Linux is the only
84      true router now :-)), all routers (at least, in neighbourhood of mine)
85      return only 8 bytes of payload. It is the end.
86
87    Hence, if we want that OSPF worked or traceroute said something reasonable,
88    we should search for another solution.
89
90    One of them is to parse packet trying to detect inner encapsulation
91    made by our node. It is difficult or even impossible, especially,
92    taking into account fragmentation. To be short, it is not a solution at all.
93
94    Current solution: The solution was UNEXPECTEDLY SIMPLE.
95    We force DF flag on tunnels with preconfigured hop limit,
96    that is ALL. :-) Well, it does not remove the problem completely,
97    but exponential growth of network traffic is changed to linear
98    (branches, that exceed pmtu are pruned) and tunnel mtu
99    fastly degrades to value <68, where looping stops.
100    Yes, it is not good if there exists a router in the loop,
101    which does not force DF, even when encapsulating packets have DF set.
102    But it is not our problem! Nobody could accuse us, we made
103    all that we could make. Even if it is your gated who injected
104    fatal route to network, even if it were you who configured
105    fatal static route: you are innocent. :-)
106
107
108
109    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
110    practically identical code. It would be good to glue them
111    together, but it is not very evident, how to make them modular.
112    sit is integral part of IPv6, ipip and gre are naturally modular.
113    We could extract common parts (hash table, ioctl etc)
114    to a separate module (ip_tunnel.c).
115
116    Alexey Kuznetsov.
117  */
118
/*
 * Forward declarations. ipgre_link_ops, ipgre_tunnel_setup() and
 * ipgre_tunnel_bind_dev() are used by ipgre_tunnel_locate() before their
 * definitions appear further down in the file.
 */
119 static struct rtnl_link_ops ipgre_link_ops __read_mostly;
120 static int ipgre_tunnel_init(struct net_device *dev);
121 static void ipgre_tunnel_setup(struct net_device *dev);
122 static int ipgre_tunnel_bind_dev(struct net_device *dev);
123
124 /* Fallback tunnel: no source, no destination, no key, no options */
125
/* Number of buckets in each tunnel hash table. */
126 #define HASH_SIZE  16
127
/* Index passed to net_generic() to fetch this file's per-namespace state. */
128 static int ipgre_net_id;
/* Per-network-namespace GRE state. */
129 struct ipgre_net {
            /* Four hash tables of tunnel chains, HASH_SIZE buckets each. */
130         struct ip_tunnel *tunnels[4][HASH_SIZE];
131
            /* Fallback device: returned by the lookup when nothing else matches and it is up. */
132         struct net_device *fb_tunnel_dev;
133 };
134
135 /* Tunnel hash table */
136
137 /*
138    4 hash tables:
139
140    3: (remote,local)
141    2: (remote,*)
142    1: (*,local)
143    0: (*,*)
144
145    We require exact key match i.e. if a key is present in packet
146    it will match only tunnel with the same key; if it is not present,
147    it will match only keyless tunnel.
148
149    All keyless packets, if not matched by configured keyless tunnels,
150    will match fallback tunnel.
151  */
152
/*
 * Fold a 32-bit address or key into a bucket index in [0, HASH_SIZE).
 *
 * The argument is fully parenthesised so that an expression argument
 * (e.g. HASH(a | b)) is cast and shifted as a whole, and the mask is derived
 * from HASH_SIZE (which must stay a power of two) instead of repeating the
 * literal 0xF.  Note that the argument is still evaluated twice.
 */
#define HASH(addr) \
	((((__force u32)(addr)) ^ (((__force u32)(addr)) >> 4)) & (HASH_SIZE - 1))
154
/*
 * Shorthand for the four tables inside struct ipgre_net, so that
 * ign->tunnels_r_l[...] expands to ign->tunnels[3][...], and so on.
 */
155 #define tunnels_r_l     tunnels[3]
156 #define tunnels_r       tunnels[2]
157 #define tunnels_l       tunnels[1]
158 #define tunnels_wc      tunnels[0]
159 /*
160  * Locking : hash tables are protected by RCU and a spinlock
161  */
162 static DEFINE_SPINLOCK(ipgre_lock);
163
/*
 * Walk one hash chain starting at @start, loading each link with
 * rcu_dereference().  The macro assigns to a variable named `t`, which the
 * enclosing function must declare as a struct ip_tunnel pointer.
 */
164 #define for_each_ip_tunnel_rcu(start) \
165         for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
166
167 /* Given src, dst and key, find appropriate for input tunnel. */
168
169 static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
170                                               __be32 remote, __be32 local,
171                                               __be32 key, __be16 gre_proto)
172 {
173         struct net *net = dev_net(dev);
174         int link = dev->ifindex;
175         unsigned h0 = HASH(remote);
176         unsigned h1 = HASH(key);
177         struct ip_tunnel *t, *cand = NULL;
178         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
179         int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
180                        ARPHRD_ETHER : ARPHRD_IPGRE;
181         int score, cand_score = 4;
182
183         for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
184                 if (local != t->parms.iph.saddr ||
185                     remote != t->parms.iph.daddr ||
186                     key != t->parms.i_key ||
187                     !(t->dev->flags & IFF_UP))
188                         continue;
189
190                 if (t->dev->type != ARPHRD_IPGRE &&
191                     t->dev->type != dev_type)
192                         continue;
193
194                 score = 0;
195                 if (t->parms.link != link)
196                         score |= 1;
197                 if (t->dev->type != dev_type)
198                         score |= 2;
199                 if (score == 0)
200                         return t;
201
202                 if (score < cand_score) {
203                         cand = t;
204                         cand_score = score;
205                 }
206         }
207
208         for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
209                 if (remote != t->parms.iph.daddr ||
210                     key != t->parms.i_key ||
211                     !(t->dev->flags & IFF_UP))
212                         continue;
213
214                 if (t->dev->type != ARPHRD_IPGRE &&
215                     t->dev->type != dev_type)
216                         continue;
217
218                 score = 0;
219                 if (t->parms.link != link)
220                         score |= 1;
221                 if (t->dev->type != dev_type)
222                         score |= 2;
223                 if (score == 0)
224                         return t;
225
226                 if (score < cand_score) {
227                         cand = t;
228                         cand_score = score;
229                 }
230         }
231
232         for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
233                 if ((local != t->parms.iph.saddr &&
234                      (local != t->parms.iph.daddr ||
235                       !ipv4_is_multicast(local))) ||
236                     key != t->parms.i_key ||
237                     !(t->dev->flags & IFF_UP))
238                         continue;
239
240                 if (t->dev->type != ARPHRD_IPGRE &&
241                     t->dev->type != dev_type)
242                         continue;
243
244                 score = 0;
245                 if (t->parms.link != link)
246                         score |= 1;
247                 if (t->dev->type != dev_type)
248                         score |= 2;
249                 if (score == 0)
250                         return t;
251
252                 if (score < cand_score) {
253                         cand = t;
254                         cand_score = score;
255                 }
256         }
257
258         for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
259                 if (t->parms.i_key != key ||
260                     !(t->dev->flags & IFF_UP))
261                         continue;
262
263                 if (t->dev->type != ARPHRD_IPGRE &&
264                     t->dev->type != dev_type)
265                         continue;
266
267                 score = 0;
268                 if (t->parms.link != link)
269                         score |= 1;
270                 if (t->dev->type != dev_type)
271                         score |= 2;
272                 if (score == 0)
273                         return t;
274
275                 if (score < cand_score) {
276                         cand = t;
277                         cand_score = score;
278                 }
279         }
280
281         if (cand != NULL)
282                 return cand;
283
284         dev = ign->fb_tunnel_dev;
285         if (dev->flags & IFF_UP)
286                 return netdev_priv(dev);
287
288         return NULL;
289 }
290
291 static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign,
292                 struct ip_tunnel_parm *parms)
293 {
294         __be32 remote = parms->iph.daddr;
295         __be32 local = parms->iph.saddr;
296         __be32 key = parms->i_key;
297         unsigned h = HASH(key);
298         int prio = 0;
299
300         if (local)
301                 prio |= 1;
302         if (remote && !ipv4_is_multicast(remote)) {
303                 prio |= 2;
304                 h ^= HASH(remote);
305         }
306
307         return &ign->tunnels[prio][h];
308 }
309
310 static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign,
311                 struct ip_tunnel *t)
312 {
313         return __ipgre_bucket(ign, &t->parms);
314 }
315
316 static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
317 {
318         struct ip_tunnel **tp = ipgre_bucket(ign, t);
319
320         spin_lock_bh(&ipgre_lock);
321         t->next = *tp;
322         rcu_assign_pointer(*tp, t);
323         spin_unlock_bh(&ipgre_lock);
324 }
325
326 static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
327 {
328         struct ip_tunnel **tp;
329
330         for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) {
331                 if (t == *tp) {
332                         spin_lock_bh(&ipgre_lock);
333                         *tp = t->next;
334                         spin_unlock_bh(&ipgre_lock);
335                         break;
336                 }
337         }
338 }
339
340 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
341                                            struct ip_tunnel_parm *parms,
342                                            int type)
343 {
344         __be32 remote = parms->iph.daddr;
345         __be32 local = parms->iph.saddr;
346         __be32 key = parms->i_key;
347         int link = parms->link;
348         struct ip_tunnel *t, **tp;
349         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
350
351         for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next)
352                 if (local == t->parms.iph.saddr &&
353                     remote == t->parms.iph.daddr &&
354                     key == t->parms.i_key &&
355                     link == t->parms.link &&
356                     type == t->dev->type)
357                         break;
358
359         return t;
360 }
361
362 static struct ip_tunnel * ipgre_tunnel_locate(struct net *net,
363                 struct ip_tunnel_parm *parms, int create)
364 {
365         struct ip_tunnel *t, *nt;
366         struct net_device *dev;
367         char name[IFNAMSIZ];
368         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
369
370         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
371         if (t || !create)
372                 return t;
373
374         if (parms->name[0])
375                 strlcpy(name, parms->name, IFNAMSIZ);
376         else
377                 sprintf(name, "gre%%d");
378
379         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
380         if (!dev)
381           return NULL;
382
383         dev_net_set(dev, net);
384
385         if (strchr(name, '%')) {
386                 if (dev_alloc_name(dev, name) < 0)
387                         goto failed_free;
388         }
389
390         nt = netdev_priv(dev);
391         nt->parms = *parms;
392         dev->rtnl_link_ops = &ipgre_link_ops;
393
394         dev->mtu = ipgre_tunnel_bind_dev(dev);
395
396         if (register_netdevice(dev) < 0)
397                 goto failed_free;
398
399         dev_hold(dev);
400         ipgre_tunnel_link(ign, nt);
401         return nt;
402
403 failed_free:
404         free_netdev(dev);
405         return NULL;
406 }
407
408 static void ipgre_tunnel_uninit(struct net_device *dev)
409 {
410         struct net *net = dev_net(dev);
411         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
412
413         ipgre_tunnel_unlink(ign, netdev_priv(dev));
414         dev_put(dev);
415 }
416
417
418 static void ipgre_err(struct sk_buff *skb, u32 info)
419 {
420
421 /* All the routers (except for Linux) return only
422    8 bytes of packet payload. It means, that precise relaying of
423    ICMP in the real Internet is absolutely infeasible.
424
425    Moreover, Cisco "wise men" put GRE key to the third word
426    in GRE header. It makes impossible maintaining even soft state for keyed
427    GRE tunnels with enabled checksum. Tell them "thank you".
428
429    Well, I wonder, rfc1812 was written by Cisco employee,
430    what the hell these idiots break standrads established
431    by themself???
432  */
433
434         struct iphdr *iph = (struct iphdr *)skb->data;
435         __be16       *p = (__be16*)(skb->data+(iph->ihl<<2));
436         int grehlen = (iph->ihl<<2) + 4;
437         const int type = icmp_hdr(skb)->type;
438         const int code = icmp_hdr(skb)->code;
439         struct ip_tunnel *t;
440         __be16 flags;
441
442         flags = p[0];
443         if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
444                 if (flags&(GRE_VERSION|GRE_ROUTING))
445                         return;
446                 if (flags&GRE_KEY) {
447                         grehlen += 4;
448                         if (flags&GRE_CSUM)
449                                 grehlen += 4;
450                 }
451         }
452
453         /* If only 8 bytes returned, keyed message will be dropped here */
454         if (skb_headlen(skb) < grehlen)
455                 return;
456
457         switch (type) {
458         default:
459         case ICMP_PARAMETERPROB:
460                 return;
461
462         case ICMP_DEST_UNREACH:
463                 switch (code) {
464                 case ICMP_SR_FAILED:
465                 case ICMP_PORT_UNREACH:
466                         /* Impossible event. */
467                         return;
468                 case ICMP_FRAG_NEEDED:
469                         /* Soft state for pmtu is maintained by IP core. */
470                         return;
471                 default:
472                         /* All others are translated to HOST_UNREACH.
473                            rfc2003 contains "deep thoughts" about NET_UNREACH,
474                            I believe they are just ether pollution. --ANK
475                          */
476                         break;
477                 }
478                 break;
479         case ICMP_TIME_EXCEEDED:
480                 if (code != ICMP_EXC_TTL)
481                         return;
482                 break;
483         }
484
485         rcu_read_lock();
486         t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
487                                 flags & GRE_KEY ?
488                                 *(((__be32 *)p) + (grehlen / 4) - 1) : 0,
489                                 p[1]);
490         if (t == NULL || t->parms.iph.daddr == 0 ||
491             ipv4_is_multicast(t->parms.iph.daddr))
492                 goto out;
493
494         if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
495                 goto out;
496
497         if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
498                 t->err_count++;
499         else
500                 t->err_count = 1;
501         t->err_time = jiffies;
502 out:
503         rcu_read_unlock();
504         return;
505 }
506
507 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
508 {
509         if (INET_ECN_is_ce(iph->tos)) {
510                 if (skb->protocol == htons(ETH_P_IP)) {
511                         IP_ECN_set_ce(ip_hdr(skb));
512                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
513                         IP6_ECN_set_ce(ipv6_hdr(skb));
514                 }
515         }
516 }
517
518 static inline u8
519 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
520 {
521         u8 inner = 0;
522         if (skb->protocol == htons(ETH_P_IP))
523                 inner = old_iph->tos;
524         else if (skb->protocol == htons(ETH_P_IPV6))
525                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
526         return INET_ECN_encapsulate(tos, inner);
527 }
528
/*
 * Receive handler for GRE packets.
 *
 * Parses the GRE flag word and the optional checksum, key and sequence
 * number, looks up the matching tunnel, strips the GRE header, enforces the
 * tunnel's checksum and sequence-number policy, updates the tunnel device's
 * rx statistics and hands the inner packet to netif_rx() with skb->dev set
 * to the tunnel device.  If no tunnel matches, an ICMP port-unreachable is
 * sent and the skb is freed.
 *
 * Always returns 0: the skb is either passed to netif_rx() or freed here.
 */
529 static int ipgre_rcv(struct sk_buff *skb)
530 {
531         struct iphdr *iph;
532         u8     *h;
533         __be16    flags;
534         __sum16   csum = 0;
535         __be32 key = 0;
536         u32    seqno = 0;
537         struct ip_tunnel *tunnel;
538         int    offset = 4;
539         __be16 gre_proto;
540         unsigned int len;
541
            /* 16 = 4 (base header) + 4 (checksum) + 4 (key) + 4 (sequence): the most parsed below. */
542         if (!pskb_may_pull(skb, 16))
543                 goto drop_nolock;
544
545         iph = ip_hdr(skb);
546         h = skb->data;
547         flags = *(__be16*)h;
548
549         if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
550                 /* - Version must be 0.
551                    - We do not support routing headers.
552                  */
553                 if (flags&(GRE_VERSION|GRE_ROUTING))
554                         goto drop_nolock;
555
556                 if (flags&GRE_CSUM) {
                            /* csum stays 0 when the checksum verifies; it is tested after the tunnel lookup. */
557                         switch (skb->ip_summed) {
558                         case CHECKSUM_COMPLETE:
559                                 csum = csum_fold(skb->csum);
560                                 if (!csum)
561                                         break;
562                                 /* fall through */
563                         case CHECKSUM_NONE:
564                                 skb->csum = 0;
565                                 csum = __skb_checksum_complete(skb);
566                                 skb->ip_summed = CHECKSUM_COMPLETE;
567                         }
568                         offset += 4;
569                 }
570                 if (flags&GRE_KEY) {
571                         key = *(__be32*)(h + offset);
572                         offset += 4;
573                 }
574                 if (flags&GRE_SEQ) {
575                         seqno = ntohl(*(__be32*)(h + offset));
576                         offset += 4;
577                 }
578         }
579
            /* The protocol type is the second 16-bit word of the GRE header. */
580         gre_proto = *(__be16 *)(h + 2);
581
582         rcu_read_lock();
            /* The packet's source address is the tunnel's remote, its destination the tunnel's local. */
583         if ((tunnel = ipgre_tunnel_lookup(skb->dev,
584                                           iph->saddr, iph->daddr, key,
585                                           gre_proto))) {
586                 struct net_device_stats *stats = &tunnel->dev->stats;
587
588                 secpath_reset(skb);
589
590                 skb->protocol = gre_proto;
591                 /* WCCP version 1 and 2 protocol decoding.
592                  * - Change protocol to IP
593                  * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
594                  */
595                 if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
596                         skb->protocol = htons(ETH_P_IP);
                            /* High nibble 4 means an IPv4 header starts here; otherwise skip 4 more bytes. */
597                         if ((*(h + offset) & 0xF0) != 0x40)
598                                 offset += 4;
599                 }
600
601                 skb->mac_header = skb->network_header;
                    /* Strip the GRE header (offset bytes) and adjust the receive checksum for it. */
602                 __pskb_pull(skb, offset);
603                 skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
604                 skb->pkt_type = PACKET_HOST;
605 #ifdef CONFIG_NET_IPGRE_BROADCAST
606                 if (ipv4_is_multicast(iph->daddr)) {
607                         /* Looped back packet, drop it! */
608                         if (skb_rtable(skb)->fl.iif == 0)
609                                 goto drop;
610                         stats->multicast++;
611                         skb->pkt_type = PACKET_BROADCAST;
612                 }
613 #endif
614
                    /* Drop on a failed checksum, or when the tunnel requires one (i_flags) and the packet has none. */
615                 if (((flags&GRE_CSUM) && csum) ||
616                     (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
617                         stats->rx_crc_errors++;
618                         stats->rx_errors++;
619                         goto drop;
620                 }
                    /*
                     * With sequencing enabled on the tunnel, drop packets that carry no
                     * sequence number or one older than expected (signed 32-bit difference).
                     */
621                 if (tunnel->parms.i_flags&GRE_SEQ) {
622                         if (!(flags&GRE_SEQ) ||
623                             (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
624                                 stats->rx_fifo_errors++;
625                                 stats->rx_errors++;
626                                 goto drop;
627                         }
628                         tunnel->i_seqno = seqno + 1;
629                 }
630
                    /* Length used for rx_bytes is sampled here, before the Ethernet handling below. */
631                 len = skb->len;
632
633                 /* Warning: All skb pointers will be invalidated! */
634                 if (tunnel->dev->type == ARPHRD_ETHER) {
635                         if (!pskb_may_pull(skb, ETH_HLEN)) {
636                                 stats->rx_length_errors++;
637                                 stats->rx_errors++;
638                                 goto drop;
639                         }
640
                            /* Re-read the outer header pointer after pskb_may_pull() (see the warning above). */
641                         iph = ip_hdr(skb);
642                         skb->protocol = eth_type_trans(skb, tunnel->dev);
643                         skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
644                 }
645
646                 stats->rx_packets++;
647                 stats->rx_bytes += len;
648                 skb->dev = tunnel->dev;
649                 skb_dst_drop(skb);
650                 nf_reset(skb);
651
652                 skb_reset_network_header(skb);
                    /* iph still refers to the outer IP header; skb now starts at the inner packet. */
653                 ipgre_ecn_decapsulate(iph, skb);
654
655                 netif_rx(skb);
656                 rcu_read_unlock();
657                 return(0);
658         }
            /* No tunnel (and no usable fallback device) for this packet. */
659         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
660
661 drop:
662         rcu_read_unlock();
663 drop_nolock:
664         kfree_skb(skb);
665         return(0);
666 }
667
/*
 * Transmit handler for GRE tunnel devices: prepend an outer IPv4 header and
 * a GRE header to @skb and send it towards the tunnel endpoint.
 *
 * Steps, in order: choose the outer header template, resolve the destination
 * (from the template, or per packet for NBMA tunnels), route the outer
 * packet, enforce path MTU (sending ICMP / ICMPv6 "too big" where needed),
 * make sure there is enough writable headroom, build the outer IP and GRE
 * headers, then send via IPTUNNEL_XMIT().
 *
 * Always returns NETDEV_TX_OK.  On every error path the skb is freed here
 * and stats->tx_errors (or txq->tx_dropped on reallocation failure) is
 * incremented.
 */
668 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
669 {
670         struct ip_tunnel *tunnel = netdev_priv(dev);
671         struct net_device_stats *stats = &dev->stats;
672         struct netdev_queue *txq = netdev_get_tx_queue(dev, 0);
673         struct iphdr  *old_iph = ip_hdr(skb);
674         struct iphdr  *tiph;
675         u8     tos;
676         __be16 df;
677         struct rtable *rt;                      /* Route to the other host */
678         struct net_device *tdev;                        /* Device to other host */
679         struct iphdr  *iph;                     /* Our new IP header */
680         unsigned int max_headroom;              /* The extra header space needed */
681         int    gre_hlen;
682         __be32 dst;
683         int    mtu;
684
685         if (dev->type == ARPHRD_ETHER)
686                 IPCB(skb)->flags = 0;
687
            /*
             * For an ARPHRD_IPGRE device with header_ops, the outer header template is
             * read from the start of skb->data and nothing more is pushed (gre_hlen = 0).
             * Otherwise the template is the tunnel's configured header and tunnel->hlen
             * bytes are pushed later.
             */
688         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
689                 gre_hlen = 0;
690                 tiph = (struct iphdr *)skb->data;
691         } else {
692                 gre_hlen = tunnel->hlen;
693                 tiph = &tunnel->parms.iph;
694         }
695
696         if ((dst = tiph->daddr) == 0) {
697                 /* NBMA tunnel */
698
699                 if (skb_dst(skb) == NULL) {
700                         stats->tx_fifo_errors++;
701                         goto tx_error;
702                 }
703
                    /* IPv4 payload: the outer destination is the inner route's gateway. */
704                 if (skb->protocol == htons(ETH_P_IP)) {
705                         rt = skb_rtable(skb);
706                         if ((dst = rt->rt_gateway) == 0)
707                                 goto tx_error_icmp;
708                 }
709 #ifdef CONFIG_IPV6
                    /*
                     * IPv6 payload: take the IPv4 address from the last 32 bits of the
                     * neighbour's address (or of the packet's destination if the neighbour
                     * address is unspecified); the address must be of IPv4-compatible type.
                     */
710                 else if (skb->protocol == htons(ETH_P_IPV6)) {
711                         struct in6_addr *addr6;
712                         int addr_type;
713                         struct neighbour *neigh = skb_dst(skb)->neighbour;
714
715                         if (neigh == NULL)
716                                 goto tx_error;
717
718                         addr6 = (struct in6_addr *)&neigh->primary_key;
719                         addr_type = ipv6_addr_type(addr6);
720
721                         if (addr_type == IPV6_ADDR_ANY) {
722                                 addr6 = &ipv6_hdr(skb)->daddr;
723                                 addr_type = ipv6_addr_type(addr6);
724                         }
725
726                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
727                                 goto tx_error_icmp;
728
729                         dst = addr6->s6_addr32[3];
730                 }
731 #endif
732                 else
733                         goto tx_error;
734         }
735
            /* A template TOS of 1 means: use the inner IPv4 packet's TOS (0 for other protocols). */
736         tos = tiph->tos;
737         if (tos == 1) {
738                 tos = 0;
739                 if (skb->protocol == htons(ETH_P_IP))
740                         tos = old_iph->tos;
741         }
742
            /* Route the outer packet; tx_carrier_errors counts routing failures. */
743         {
744                 struct flowi fl = { .oif = tunnel->parms.link,
745                                     .nl_u = { .ip4_u =
746                                               { .daddr = dst,
747                                                 .saddr = tiph->saddr,
748                                                 .tos = RT_TOS(tos) } },
749                                     .proto = IPPROTO_GRE };
750                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
751                         stats->tx_carrier_errors++;
752                         goto tx_error;
753                 }
754         }
755         tdev = rt->u.dst.dev;
756
            /* The route leads back into this tunnel device: count a collision and drop. */
757         if (tdev == dev) {
758                 ip_rt_put(rt);
759                 stats->collisions++;
760                 goto tx_error;
761         }
762
            /*
             * With a non-zero frag_off in the template, the usable MTU is the outer route's
             * MTU minus our header overhead; otherwise use the inner route's MTU, or the
             * device MTU when the skb has no dst.
             */
763         df = tiph->frag_off;
764         if (df)
765                 mtu = dst_mtu(&rt->u.dst) - dev->hard_header_len - tunnel->hlen;
766         else
767                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
768
769         if (skb_dst(skb))
770                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
771
772         if (skb->protocol == htons(ETH_P_IP)) {
                    /* Propagate the inner packet's DF bit to the outer header. */
773                 df |= (old_iph->frag_off&htons(IP_DF));
774
775                 if ((old_iph->frag_off&htons(IP_DF)) &&
776                     mtu < ntohs(old_iph->tot_len)) {
777                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
778                         ip_rt_put(rt);
779                         goto tx_error;
780                 }
781         }
782 #ifdef CONFIG_IPV6
783         else if (skb->protocol == htons(ETH_P_IPV6)) {
784                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
785
786                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
787                         if ((tunnel->parms.iph.daddr &&
788                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
789                             rt6->rt6i_dst.plen == 128) {
790                                 rt6->rt6i_flags |= RTF_MODIFIED;
791                                 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
792                         }
793                 }
794
795                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
796                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, dev);
797                         ip_rt_put(rt);
798                         goto tx_error;
799                 }
800         }
801 #endif
802
            /*
             * err_count / err_time are maintained by ipgre_err().  While errors were recorded
             * within IPTUNNEL_ERR_TIMEO, signal one link failure per recorded error; the
             * packet is still transmitted below.
             */
803         if (tunnel->err_count > 0) {
804                 if (time_before(jiffies,
805                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
806                         tunnel->err_count--;
807
808                         dst_link_failure(skb);
809                 } else
810                         tunnel->err_count = 0;
811         }
812
813         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen;
814
            /* Get a private copy with enough headroom when the skb is too small, shared, or a non-writable clone. */
815         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
816             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
817                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
818                 if (!new_skb) {
819                         ip_rt_put(rt);
820                         txq->tx_dropped++;
821                         dev_kfree_skb(skb);
822                         return NETDEV_TX_OK;
823                 }
824                 if (skb->sk)
825                         skb_set_owner_w(new_skb, skb->sk);
826                 dev_kfree_skb(skb);
827                 skb = new_skb;
                    /* The old buffer is gone: refresh the inner header pointer. */
828                 old_iph = ip_hdr(skb);
829         }
830
            /* The inner packet becomes the transport payload; the pushed area becomes the new network header. */
831         skb_reset_transport_header(skb);
832         skb_push(skb, gre_hlen);
833         skb_reset_network_header(skb);
834         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
835         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
836                               IPSKB_REROUTED);
837         skb_dst_drop(skb);
838         skb_dst_set(skb, &rt->u.dst);
839
840         /*
841          *      Push down and install the IPIP header.
842          */
843
844         iph                     =       ip_hdr(skb);
845         iph->version            =       4;
846         iph->ihl                =       sizeof(struct iphdr) >> 2;
847         iph->frag_off           =       df;
848         iph->protocol           =       IPPROTO_GRE;
849         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
850         iph->daddr              =       rt->rt_dst;
851         iph->saddr              =       rt->rt_src;
852
            /* A template TTL of 0 means inherit: inner TTL / hop limit, else the route's hop-limit metric. */
853         if ((iph->ttl = tiph->ttl) == 0) {
854                 if (skb->protocol == htons(ETH_P_IP))
855                         iph->ttl = old_iph->ttl;
856 #ifdef CONFIG_IPV6
857                 else if (skb->protocol == htons(ETH_P_IPV6))
858                         iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
859 #endif
860                 else
861                         iph->ttl = dst_metric(&rt->u.dst, RTAX_HOPLIMIT);
862         }
863
            /* GRE base header right after the IP header: flags word, then protocol type. */
864         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
865         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
866                                    htons(ETH_P_TEB) : skb->protocol;
867
            /*
             * Optional fields are filled in back to front: ptr starts at the last 32-bit word
             * of the tunnel header and steps backwards through sequence, key and checksum.
             */
868         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
869                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
870
871                 if (tunnel->parms.o_flags&GRE_SEQ) {
872                         ++tunnel->o_seqno;
873                         *ptr = htonl(tunnel->o_seqno);
874                         ptr--;
875                 }
876                 if (tunnel->parms.o_flags&GRE_KEY) {
877                         *ptr = tunnel->parms.o_key;
878                         ptr--;
879                 }
880                 if (tunnel->parms.o_flags&GRE_CSUM) {
                            /* Zero the checksum word, then checksum everything after the IP header. */
881                         *ptr = 0;
882                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
883                 }
884         }
885
886         nf_reset(skb);
887
            /*
             * NOTE(review): IPTUNNEL_XMIT() takes no arguments, so it presumably uses locals
             * of this function (skb, iph, rt, stats, txq ...) by name - check its definition
             * before renaming any of them.
             */
888         IPTUNNEL_XMIT();
889         return NETDEV_TX_OK;
890
891 tx_error_icmp:
892         dst_link_failure(skb);
893
894 tx_error:
895         stats->tx_errors++;
896         dev_kfree_skb(skb);
897         return NETDEV_TX_OK;
898 }
899
900 static int ipgre_tunnel_bind_dev(struct net_device *dev)
901 {
902         struct net_device *tdev = NULL;
903         struct ip_tunnel *tunnel;
904         struct iphdr *iph;
905         int hlen = LL_MAX_HEADER;
906         int mtu = ETH_DATA_LEN;
907         int addend = sizeof(struct iphdr) + 4;
908
909         tunnel = netdev_priv(dev);
910         iph = &tunnel->parms.iph;
911
912         /* Guess output device to choose reasonable mtu and needed_headroom */
913
914         if (iph->daddr) {
915                 struct flowi fl = { .oif = tunnel->parms.link,
916                                     .nl_u = { .ip4_u =
917                                               { .daddr = iph->daddr,
918                                                 .saddr = iph->saddr,
919                                                 .tos = RT_TOS(iph->tos) } },
920                                     .proto = IPPROTO_GRE };
921                 struct rtable *rt;
922                 if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
923                         tdev = rt->u.dst.dev;
924                         ip_rt_put(rt);
925                 }
926
927                 if (dev->type != ARPHRD_ETHER)
928                         dev->flags |= IFF_POINTOPOINT;
929         }
930
931         if (!tdev && tunnel->parms.link)
932                 tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);
933
934         if (tdev) {
935                 hlen = tdev->hard_header_len + tdev->needed_headroom;
936                 mtu = tdev->mtu;
937         }
938         dev->iflink = tunnel->parms.link;
939
940         /* Precalculate GRE options length */
941         if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
942                 if (tunnel->parms.o_flags&GRE_CSUM)
943                         addend += 4;
944                 if (tunnel->parms.o_flags&GRE_KEY)
945                         addend += 4;
946                 if (tunnel->parms.o_flags&GRE_SEQ)
947                         addend += 4;
948         }
949         dev->needed_headroom = addend + hlen;
950         mtu -= dev->hard_header_len + addend;
951
952         if (mtu < 68)
953                 mtu = 68;
954
955         tunnel->hlen = addend;
956
957         return mtu;
958 }
959
960 static int
961 ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
962 {
963         int err = 0;
964         struct ip_tunnel_parm p;
965         struct ip_tunnel *t;
966         struct net *net = dev_net(dev);
967         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
968
969         switch (cmd) {
970         case SIOCGETTUNNEL:
971                 t = NULL;
972                 if (dev == ign->fb_tunnel_dev) {
973                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
974                                 err = -EFAULT;
975                                 break;
976                         }
977                         t = ipgre_tunnel_locate(net, &p, 0);
978                 }
979                 if (t == NULL)
980                         t = netdev_priv(dev);
981                 memcpy(&p, &t->parms, sizeof(p));
982                 if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
983                         err = -EFAULT;
984                 break;
985
986         case SIOCADDTUNNEL:
987         case SIOCCHGTUNNEL:
988                 err = -EPERM;
989                 if (!capable(CAP_NET_ADMIN))
990                         goto done;
991
992                 err = -EFAULT;
993                 if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
994                         goto done;
995
996                 err = -EINVAL;
997                 if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
998                     p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
999                     ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
1000                         goto done;
1001                 if (p.iph.ttl)
1002                         p.iph.frag_off |= htons(IP_DF);
1003
1004                 if (!(p.i_flags&GRE_KEY))
1005                         p.i_key = 0;
1006                 if (!(p.o_flags&GRE_KEY))
1007                         p.o_key = 0;
1008
1009                 t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);
1010
1011                 if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
1012                         if (t != NULL) {
1013                                 if (t->dev != dev) {
1014                                         err = -EEXIST;
1015                                         break;
1016                                 }
1017                         } else {
1018                                 unsigned nflags = 0;
1019
1020                                 t = netdev_priv(dev);
1021
1022                                 if (ipv4_is_multicast(p.iph.daddr))
1023                                         nflags = IFF_BROADCAST;
1024                                 else if (p.iph.daddr)
1025                                         nflags = IFF_POINTOPOINT;
1026
1027                                 if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
1028                                         err = -EINVAL;
1029                                         break;
1030                                 }
1031                                 ipgre_tunnel_unlink(ign, t);
1032                                 t->parms.iph.saddr = p.iph.saddr;
1033                                 t->parms.iph.daddr = p.iph.daddr;
1034                                 t->parms.i_key = p.i_key;
1035                                 t->parms.o_key = p.o_key;
1036                                 memcpy(dev->dev_addr, &p.iph.saddr, 4);
1037                                 memcpy(dev->broadcast, &p.iph.daddr, 4);
1038                                 ipgre_tunnel_link(ign, t);
1039                                 netdev_state_change(dev);
1040                         }
1041                 }
1042
1043                 if (t) {
1044                         err = 0;
1045                         if (cmd == SIOCCHGTUNNEL) {
1046                                 t->parms.iph.ttl = p.iph.ttl;
1047                                 t->parms.iph.tos = p.iph.tos;
1048                                 t->parms.iph.frag_off = p.iph.frag_off;
1049                                 if (t->parms.link != p.link) {
1050                                         t->parms.link = p.link;
1051                                         dev->mtu = ipgre_tunnel_bind_dev(dev);
1052                                         netdev_state_change(dev);
1053                                 }
1054                         }
1055                         if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
1056                                 err = -EFAULT;
1057                 } else
1058                         err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
1059                 break;
1060
1061         case SIOCDELTUNNEL:
1062                 err = -EPERM;
1063                 if (!capable(CAP_NET_ADMIN))
1064                         goto done;
1065
1066                 if (dev == ign->fb_tunnel_dev) {
1067                         err = -EFAULT;
1068                         if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
1069                                 goto done;
1070                         err = -ENOENT;
1071                         if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
1072                                 goto done;
1073                         err = -EPERM;
1074                         if (t == netdev_priv(ign->fb_tunnel_dev))
1075                                 goto done;
1076                         dev = t->dev;
1077                 }
1078                 unregister_netdevice(dev);
1079                 err = 0;
1080                 break;
1081
1082         default:
1083                 err = -EINVAL;
1084         }
1085
1086 done:
1087         return err;
1088 }
1089
1090 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1091 {
1092         struct ip_tunnel *tunnel = netdev_priv(dev);
1093         if (new_mtu < 68 ||
1094             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1095                 return -EINVAL;
1096         dev->mtu = new_mtu;
1097         return 0;
1098 }
1099
/* Nice toy. Unfortunately, useless in real life :-)
   It allows constructing a virtual multiprotocol broadcast "LAN"
   over the Internet, provided multicast routing is tuned.


   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).
1109
1110    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1111    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1112
1113    ping -t 255 224.66.66.66
1114
1115    If nobody answers, mbone does not work.
1116
1117    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1118    ip addr add 10.66.66.<somewhat>/24 dev Universe
1119    ifconfig Universe up
1120    ifconfig Universe add fe80::<Your_real_addr>/10
1121    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1122    ftp 10.66.66.66
1123    ...
1124    ftp fec0:6666:6666::193.233.7.65
1125    ...
1126
1127  */
1128
1129 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1130                         unsigned short type,
1131                         const void *daddr, const void *saddr, unsigned len)
1132 {
1133         struct ip_tunnel *t = netdev_priv(dev);
1134         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1135         __be16 *p = (__be16*)(iph+1);
1136
1137         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1138         p[0]            = t->parms.o_flags;
1139         p[1]            = htons(type);
1140
1141         /*
1142          *      Set the source hardware address.
1143          */
1144
1145         if (saddr)
1146                 memcpy(&iph->saddr, saddr, 4);
1147
1148         if (daddr) {
1149                 memcpy(&iph->daddr, daddr, 4);
1150                 return t->hlen;
1151         }
1152         if (iph->daddr && !ipv4_is_multicast(iph->daddr))
1153                 return t->hlen;
1154
1155         return -t->hlen;
1156 }
1157
1158 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1159 {
1160         struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1161         memcpy(haddr, &iph->saddr, 4);
1162         return 4;
1163 }
1164
1165 static const struct header_ops ipgre_header_ops = {
1166         .create = ipgre_header,
1167         .parse  = ipgre_header_parse,
1168 };
1169
1170 #ifdef CONFIG_NET_IPGRE_BROADCAST
1171 static int ipgre_open(struct net_device *dev)
1172 {
1173         struct ip_tunnel *t = netdev_priv(dev);
1174
1175         if (ipv4_is_multicast(t->parms.iph.daddr)) {
1176                 struct flowi fl = { .oif = t->parms.link,
1177                                     .nl_u = { .ip4_u =
1178                                               { .daddr = t->parms.iph.daddr,
1179                                                 .saddr = t->parms.iph.saddr,
1180                                                 .tos = RT_TOS(t->parms.iph.tos) } },
1181                                     .proto = IPPROTO_GRE };
1182                 struct rtable *rt;
1183                 if (ip_route_output_key(dev_net(dev), &rt, &fl))
1184                         return -EADDRNOTAVAIL;
1185                 dev = rt->u.dst.dev;
1186                 ip_rt_put(rt);
1187                 if (__in_dev_get_rtnl(dev) == NULL)
1188                         return -EADDRNOTAVAIL;
1189                 t->mlink = dev->ifindex;
1190                 ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
1191         }
1192         return 0;
1193 }
1194
1195 static int ipgre_close(struct net_device *dev)
1196 {
1197         struct ip_tunnel *t = netdev_priv(dev);
1198
1199         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1200                 struct in_device *in_dev;
1201                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1202                 if (in_dev) {
1203                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1204                         in_dev_put(in_dev);
1205                 }
1206         }
1207         return 0;
1208 }
1209
1210 #endif
1211
1212 static const struct net_device_ops ipgre_netdev_ops = {
1213         .ndo_init               = ipgre_tunnel_init,
1214         .ndo_uninit             = ipgre_tunnel_uninit,
1215 #ifdef CONFIG_NET_IPGRE_BROADCAST
1216         .ndo_open               = ipgre_open,
1217         .ndo_stop               = ipgre_close,
1218 #endif
1219         .ndo_start_xmit         = ipgre_tunnel_xmit,
1220         .ndo_do_ioctl           = ipgre_tunnel_ioctl,
1221         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1222 };
1223
1224 static void ipgre_tunnel_setup(struct net_device *dev)
1225 {
1226         dev->netdev_ops         = &ipgre_netdev_ops;
1227         dev->destructor         = free_netdev;
1228
1229         dev->type               = ARPHRD_IPGRE;
1230         dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
1231         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
1232         dev->flags              = IFF_NOARP;
1233         dev->iflink             = 0;
1234         dev->addr_len           = 4;
1235         dev->features           |= NETIF_F_NETNS_LOCAL;
1236         dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
1237 }
1238
1239 static int ipgre_tunnel_init(struct net_device *dev)
1240 {
1241         struct ip_tunnel *tunnel;
1242         struct iphdr *iph;
1243
1244         tunnel = netdev_priv(dev);
1245         iph = &tunnel->parms.iph;
1246
1247         tunnel->dev = dev;
1248         strcpy(tunnel->parms.name, dev->name);
1249
1250         memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
1251         memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);
1252
1253         if (iph->daddr) {
1254 #ifdef CONFIG_NET_IPGRE_BROADCAST
1255                 if (ipv4_is_multicast(iph->daddr)) {
1256                         if (!iph->saddr)
1257                                 return -EINVAL;
1258                         dev->flags = IFF_BROADCAST;
1259                         dev->header_ops = &ipgre_header_ops;
1260                 }
1261 #endif
1262         } else
1263                 dev->header_ops = &ipgre_header_ops;
1264
1265         return 0;
1266 }
1267
1268 static void ipgre_fb_tunnel_init(struct net_device *dev)
1269 {
1270         struct ip_tunnel *tunnel = netdev_priv(dev);
1271         struct iphdr *iph = &tunnel->parms.iph;
1272         struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);
1273
1274         tunnel->dev = dev;
1275         strcpy(tunnel->parms.name, dev->name);
1276
1277         iph->version            = 4;
1278         iph->protocol           = IPPROTO_GRE;
1279         iph->ihl                = 5;
1280         tunnel->hlen            = sizeof(struct iphdr) + 4;
1281
1282         dev_hold(dev);
1283         ign->tunnels_wc[0]      = tunnel;
1284 }
1285
1286
1287 static const struct net_protocol ipgre_protocol = {
1288         .handler        =       ipgre_rcv,
1289         .err_handler    =       ipgre_err,
1290         .netns_ok       =       1,
1291 };
1292
1293 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1294 {
1295         int prio;
1296
1297         for (prio = 0; prio < 4; prio++) {
1298                 int h;
1299                 for (h = 0; h < HASH_SIZE; h++) {
1300                         struct ip_tunnel *t = ign->tunnels[prio][h];
1301
1302                         while (t != NULL) {
1303                                 unregister_netdevice_queue(t->dev, head);
1304                                 t = t->next;
1305                         }
1306                 }
1307         }
1308 }
1309
1310 static int ipgre_init_net(struct net *net)
1311 {
1312         int err;
1313         struct ipgre_net *ign;
1314
1315         err = -ENOMEM;
1316         ign = kzalloc(sizeof(struct ipgre_net), GFP_KERNEL);
1317         if (ign == NULL)
1318                 goto err_alloc;
1319
1320         err = net_assign_generic(net, ipgre_net_id, ign);
1321         if (err < 0)
1322                 goto err_assign;
1323
1324         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1325                                            ipgre_tunnel_setup);
1326         if (!ign->fb_tunnel_dev) {
1327                 err = -ENOMEM;
1328                 goto err_alloc_dev;
1329         }
1330         dev_net_set(ign->fb_tunnel_dev, net);
1331
1332         ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1333         ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1334
1335         if ((err = register_netdev(ign->fb_tunnel_dev)))
1336                 goto err_reg_dev;
1337
1338         return 0;
1339
1340 err_reg_dev:
1341         free_netdev(ign->fb_tunnel_dev);
1342 err_alloc_dev:
1343         /* nothing */
1344 err_assign:
1345         kfree(ign);
1346 err_alloc:
1347         return err;
1348 }
1349
1350 static void ipgre_exit_net(struct net *net)
1351 {
1352         struct ipgre_net *ign;
1353         LIST_HEAD(list);
1354
1355         ign = net_generic(net, ipgre_net_id);
1356         rtnl_lock();
1357         ipgre_destroy_tunnels(ign, &list);
1358         unregister_netdevice_many(&list);
1359         rtnl_unlock();
1360         kfree(ign);
1361 }
1362
1363 static struct pernet_operations ipgre_net_ops = {
1364         .init = ipgre_init_net,
1365         .exit = ipgre_exit_net,
1366 };
1367
1368 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1369 {
1370         __be16 flags;
1371
1372         if (!data)
1373                 return 0;
1374
1375         flags = 0;
1376         if (data[IFLA_GRE_IFLAGS])
1377                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1378         if (data[IFLA_GRE_OFLAGS])
1379                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1380         if (flags & (GRE_VERSION|GRE_ROUTING))
1381                 return -EINVAL;
1382
1383         return 0;
1384 }
1385
1386 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1387 {
1388         __be32 daddr;
1389
1390         if (tb[IFLA_ADDRESS]) {
1391                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1392                         return -EINVAL;
1393                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1394                         return -EADDRNOTAVAIL;
1395         }
1396
1397         if (!data)
1398                 goto out;
1399
1400         if (data[IFLA_GRE_REMOTE]) {
1401                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1402                 if (!daddr)
1403                         return -EINVAL;
1404         }
1405
1406 out:
1407         return ipgre_tunnel_validate(tb, data);
1408 }
1409
1410 static void ipgre_netlink_parms(struct nlattr *data[],
1411                                 struct ip_tunnel_parm *parms)
1412 {
1413         memset(parms, 0, sizeof(*parms));
1414
1415         parms->iph.protocol = IPPROTO_GRE;
1416
1417         if (!data)
1418                 return;
1419
1420         if (data[IFLA_GRE_LINK])
1421                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1422
1423         if (data[IFLA_GRE_IFLAGS])
1424                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1425
1426         if (data[IFLA_GRE_OFLAGS])
1427                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1428
1429         if (data[IFLA_GRE_IKEY])
1430                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1431
1432         if (data[IFLA_GRE_OKEY])
1433                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1434
1435         if (data[IFLA_GRE_LOCAL])
1436                 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1437
1438         if (data[IFLA_GRE_REMOTE])
1439                 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1440
1441         if (data[IFLA_GRE_TTL])
1442                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1443
1444         if (data[IFLA_GRE_TOS])
1445                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1446
1447         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1448                 parms->iph.frag_off = htons(IP_DF);
1449 }
1450
1451 static int ipgre_tap_init(struct net_device *dev)
1452 {
1453         struct ip_tunnel *tunnel;
1454
1455         tunnel = netdev_priv(dev);
1456
1457         tunnel->dev = dev;
1458         strcpy(tunnel->parms.name, dev->name);
1459
1460         ipgre_tunnel_bind_dev(dev);
1461
1462         return 0;
1463 }
1464
1465 static const struct net_device_ops ipgre_tap_netdev_ops = {
1466         .ndo_init               = ipgre_tap_init,
1467         .ndo_uninit             = ipgre_tunnel_uninit,
1468         .ndo_start_xmit         = ipgre_tunnel_xmit,
1469         .ndo_set_mac_address    = eth_mac_addr,
1470         .ndo_validate_addr      = eth_validate_addr,
1471         .ndo_change_mtu         = ipgre_tunnel_change_mtu,
1472 };
1473
1474 static void ipgre_tap_setup(struct net_device *dev)
1475 {
1476
1477         ether_setup(dev);
1478
1479         dev->netdev_ops         = &ipgre_tap_netdev_ops;
1480         dev->destructor         = free_netdev;
1481
1482         dev->iflink             = 0;
1483         dev->features           |= NETIF_F_NETNS_LOCAL;
1484 }
1485
1486 static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
1487                          struct nlattr *data[])
1488 {
1489         struct ip_tunnel *nt;
1490         struct net *net = dev_net(dev);
1491         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1492         int mtu;
1493         int err;
1494
1495         nt = netdev_priv(dev);
1496         ipgre_netlink_parms(data, &nt->parms);
1497
1498         if (ipgre_tunnel_find(net, &nt->parms, dev->type))
1499                 return -EEXIST;
1500
1501         if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
1502                 random_ether_addr(dev->dev_addr);
1503
1504         mtu = ipgre_tunnel_bind_dev(dev);
1505         if (!tb[IFLA_MTU])
1506                 dev->mtu = mtu;
1507
1508         err = register_netdevice(dev);
1509         if (err)
1510                 goto out;
1511
1512         dev_hold(dev);
1513         ipgre_tunnel_link(ign, nt);
1514
1515 out:
1516         return err;
1517 }
1518
1519 static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
1520                             struct nlattr *data[])
1521 {
1522         struct ip_tunnel *t, *nt;
1523         struct net *net = dev_net(dev);
1524         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1525         struct ip_tunnel_parm p;
1526         int mtu;
1527
1528         if (dev == ign->fb_tunnel_dev)
1529                 return -EINVAL;
1530
1531         nt = netdev_priv(dev);
1532         ipgre_netlink_parms(data, &p);
1533
1534         t = ipgre_tunnel_locate(net, &p, 0);
1535
1536         if (t) {
1537                 if (t->dev != dev)
1538                         return -EEXIST;
1539         } else {
1540                 t = nt;
1541
1542                 if (dev->type != ARPHRD_ETHER) {
1543                         unsigned nflags = 0;
1544
1545                         if (ipv4_is_multicast(p.iph.daddr))
1546                                 nflags = IFF_BROADCAST;
1547                         else if (p.iph.daddr)
1548                                 nflags = IFF_POINTOPOINT;
1549
1550                         if ((dev->flags ^ nflags) &
1551                             (IFF_POINTOPOINT | IFF_BROADCAST))
1552                                 return -EINVAL;
1553                 }
1554
1555                 ipgre_tunnel_unlink(ign, t);
1556                 t->parms.iph.saddr = p.iph.saddr;
1557                 t->parms.iph.daddr = p.iph.daddr;
1558                 t->parms.i_key = p.i_key;
1559                 if (dev->type != ARPHRD_ETHER) {
1560                         memcpy(dev->dev_addr, &p.iph.saddr, 4);
1561                         memcpy(dev->broadcast, &p.iph.daddr, 4);
1562                 }
1563                 ipgre_tunnel_link(ign, t);
1564                 netdev_state_change(dev);
1565         }
1566
1567         t->parms.o_key = p.o_key;
1568         t->parms.iph.ttl = p.iph.ttl;
1569         t->parms.iph.tos = p.iph.tos;
1570         t->parms.iph.frag_off = p.iph.frag_off;
1571
1572         if (t->parms.link != p.link) {
1573                 t->parms.link = p.link;
1574                 mtu = ipgre_tunnel_bind_dev(dev);
1575                 if (!tb[IFLA_MTU])
1576                         dev->mtu = mtu;
1577                 netdev_state_change(dev);
1578         }
1579
1580         return 0;
1581 }
1582
1583 static size_t ipgre_get_size(const struct net_device *dev)
1584 {
1585         return
1586                 /* IFLA_GRE_LINK */
1587                 nla_total_size(4) +
1588                 /* IFLA_GRE_IFLAGS */
1589                 nla_total_size(2) +
1590                 /* IFLA_GRE_OFLAGS */
1591                 nla_total_size(2) +
1592                 /* IFLA_GRE_IKEY */
1593                 nla_total_size(4) +
1594                 /* IFLA_GRE_OKEY */
1595                 nla_total_size(4) +
1596                 /* IFLA_GRE_LOCAL */
1597                 nla_total_size(4) +
1598                 /* IFLA_GRE_REMOTE */
1599                 nla_total_size(4) +
1600                 /* IFLA_GRE_TTL */
1601                 nla_total_size(1) +
1602                 /* IFLA_GRE_TOS */
1603                 nla_total_size(1) +
1604                 /* IFLA_GRE_PMTUDISC */
1605                 nla_total_size(1) +
1606                 0;
1607 }
1608
1609 static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
1610 {
1611         struct ip_tunnel *t = netdev_priv(dev);
1612         struct ip_tunnel_parm *p = &t->parms;
1613
1614         NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
1615         NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
1616         NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
1617         NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
1618         NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
1619         NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
1620         NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
1621         NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
1622         NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
1623         NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));
1624
1625         return 0;
1626
1627 nla_put_failure:
1628         return -EMSGSIZE;
1629 }
1630
1631 static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
1632         [IFLA_GRE_LINK]         = { .type = NLA_U32 },
1633         [IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
1634         [IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
1635         [IFLA_GRE_IKEY]         = { .type = NLA_U32 },
1636         [IFLA_GRE_OKEY]         = { .type = NLA_U32 },
1637         [IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
1638         [IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
1639         [IFLA_GRE_TTL]          = { .type = NLA_U8 },
1640         [IFLA_GRE_TOS]          = { .type = NLA_U8 },
1641         [IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
1642 };
1643
1644 static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
1645         .kind           = "gre",
1646         .maxtype        = IFLA_GRE_MAX,
1647         .policy         = ipgre_policy,
1648         .priv_size      = sizeof(struct ip_tunnel),
1649         .setup          = ipgre_tunnel_setup,
1650         .validate       = ipgre_tunnel_validate,
1651         .newlink        = ipgre_newlink,
1652         .changelink     = ipgre_changelink,
1653         .get_size       = ipgre_get_size,
1654         .fill_info      = ipgre_fill_info,
1655 };
1656
1657 static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
1658         .kind           = "gretap",
1659         .maxtype        = IFLA_GRE_MAX,
1660         .policy         = ipgre_policy,
1661         .priv_size      = sizeof(struct ip_tunnel),
1662         .setup          = ipgre_tap_setup,
1663         .validate       = ipgre_tap_validate,
1664         .newlink        = ipgre_newlink,
1665         .changelink     = ipgre_changelink,
1666         .get_size       = ipgre_get_size,
1667         .fill_info      = ipgre_fill_info,
1668 };
1669
1670 /*
1671  *      And now the modules code and kernel interface.
1672  */
1673
1674 static int __init ipgre_init(void)
1675 {
1676         int err;
1677
1678         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1679
1680         if (inet_add_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) {
1681                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1682                 return -EAGAIN;
1683         }
1684
1685         err = register_pernet_gen_device(&ipgre_net_id, &ipgre_net_ops);
1686         if (err < 0)
1687                 goto gen_device_failed;
1688
1689         err = rtnl_link_register(&ipgre_link_ops);
1690         if (err < 0)
1691                 goto rtnl_link_failed;
1692
1693         err = rtnl_link_register(&ipgre_tap_ops);
1694         if (err < 0)
1695                 goto tap_ops_failed;
1696
1697 out:
1698         return err;
1699
1700 tap_ops_failed:
1701         rtnl_link_unregister(&ipgre_link_ops);
1702 rtnl_link_failed:
1703         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1704 gen_device_failed:
1705         inet_del_protocol(&ipgre_protocol, IPPROTO_GRE);
1706         goto out;
1707 }
1708
1709 static void __exit ipgre_fini(void)
1710 {
1711         rtnl_link_unregister(&ipgre_tap_ops);
1712         rtnl_link_unregister(&ipgre_link_ops);
1713         unregister_pernet_gen_device(ipgre_net_id, &ipgre_net_ops);
1714         if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0)
1715                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1716 }
1717
1718 module_init(ipgre_init);
1719 module_exit(ipgre_fini);
1720 MODULE_LICENSE("GPL");
1721 MODULE_ALIAS_RTNL_LINK("gre");
1722 MODULE_ALIAS_RTNL_LINK("gretap");