/* net-next-2.6.git: net/ipv4/ip_gre.c */
1 /*
2  *      Linux NET3:     GRE over IP protocol decoder.
3  *
4  *      Authors: Alexey Kuznetsov (kuznet@ms2.inr.ac.ru)
5  *
6  *      This program is free software; you can redistribute it and/or
7  *      modify it under the terms of the GNU General Public License
8  *      as published by the Free Software Foundation; either version
9  *      2 of the License, or (at your option) any later version.
10  *
11  */
12
13 #include <linux/capability.h>
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/slab.h>
18 #include <asm/uaccess.h>
19 #include <linux/skbuff.h>
20 #include <linux/netdevice.h>
21 #include <linux/in.h>
22 #include <linux/tcp.h>
23 #include <linux/udp.h>
24 #include <linux/if_arp.h>
25 #include <linux/mroute.h>
26 #include <linux/init.h>
27 #include <linux/in6.h>
28 #include <linux/inetdevice.h>
29 #include <linux/igmp.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/etherdevice.h>
32 #include <linux/if_ether.h>
33
34 #include <net/sock.h>
35 #include <net/ip.h>
36 #include <net/icmp.h>
37 #include <net/protocol.h>
38 #include <net/ipip.h>
39 #include <net/arp.h>
40 #include <net/checksum.h>
41 #include <net/dsfield.h>
42 #include <net/inet_ecn.h>
43 #include <net/xfrm.h>
44 #include <net/net_namespace.h>
45 #include <net/netns/generic.h>
46 #include <net/rtnetlink.h>
47 #include <net/gre.h>
48
49 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
50 #include <net/ipv6.h>
51 #include <net/ip6_fib.h>
52 #include <net/ip6_route.h>
53 #endif
54
55 /*
56    Problems & solutions
57    --------------------
58
59    1. The most important issue is detecting local dead loops.
60    They would cause complete host lockup in transmit, which
61    would be "resolved" by stack overflow or, if queueing is enabled,
62    with infinite looping in net_bh.
63
64    We cannot track such dead loops during route installation,
65    it is infeasible task. The most general solutions would be
66    to keep skb->encapsulation counter (sort of local ttl),
67    and silently drop packet when it expires. It is a good
68    solution, but it supposes maintaining a new variable in ALL
69    skb, even if no tunneling is used.
70
71    Current solution: xmit_recursion breaks dead loops. This is a percpu
72    counter, since when we enter the first ndo_xmit(), cpu migration is
73    forbidden. We force an exit if this counter reaches RECURSION_LIMIT
74
75    2. Networking dead loops would not kill routers, but would really
76    kill network. IP hop limit plays role of "t->recursion" in this case,
77    if we copy it from packet being encapsulated to upper header.
78    It is very good solution, but it introduces two problems:
79
80    - Routing protocols, using packets with ttl=1 (OSPF, RIP2),
81      do not work over tunnels.
82    - traceroute does not work. I planned to relay ICMP from tunnel,
83      so that this problem would be solved and traceroute output
84      would even more informative. This idea appeared to be wrong:
85      only Linux complies to rfc1812 now (yes, guys, Linux is the only
86      true router now :-)), all routers (at least, in neighbourhood of mine)
87      return only 8 bytes of payload. It is the end.
88
89    Hence, if we want that OSPF worked or traceroute said something reasonable,
90    we should search for another solution.
91
92    One of them is to parse packet trying to detect inner encapsulation
93    made by our node. It is difficult or even impossible, especially,
94    taking into account fragmentation. To be short, it is not a solution at all.
95
96    Current solution: The solution was UNEXPECTEDLY SIMPLE.
97    We force DF flag on tunnels with preconfigured hop limit,
98    that is ALL. :-) Well, it does not remove the problem completely,
99    but exponential growth of network traffic is changed to linear
100    (branches, that exceed pmtu are pruned) and tunnel mtu
101    quickly degrades to a value <68, where looping stops.
102    Yes, it is not good if there exists a router in the loop,
103    which does not force DF, even when encapsulating packets have DF set.
104    But it is not our problem! Nobody could accuse us, we made
105    all that we could make. Even if it is your gated who injected
106    fatal route to network, even if it were you who configured
107    fatal static route: you are innocent. :-)
108
109
110
111    3. Really, ipv4/ipip.c, ipv4/ip_gre.c and ipv6/sit.c contain
112    practically identical code. It would be good to glue them
113    together, but it is not very evident, how to make them modular.
114    sit is integral part of IPv6, ipip and gre are naturally modular.
115    We could extract common parts (hash table, ioctl etc)
116    to a separate module (ip_tunnel.c).
117
118    Alexey Kuznetsov.
119  */
120
static struct rtnl_link_ops ipgre_link_ops __read_mostly;
static int ipgre_tunnel_init(struct net_device *dev);
static void ipgre_tunnel_setup(struct net_device *dev);
static int ipgre_tunnel_bind_dev(struct net_device *dev);

/* Fallback tunnel: no source, no destination, no key, no options */

/* Number of buckets in each of the four tunnel hash tables */
#define HASH_SIZE  16

/* Index of our per-namespace state in the net_generic() array */
static int ipgre_net_id __read_mostly;

/* Per network namespace state for the GRE tunnel driver */
struct ipgre_net {
	/* Four hash tables, indexed by endpoint specificity; see the
	 * "Tunnel hash table" comment below for the layout. */
	struct ip_tunnel __rcu *tunnels[4][HASH_SIZE];

	/* Always-present catch-all device: receives packets matching
	 * no configured tunnel (see ipgre_tunnel_lookup()). */
	struct net_device *fb_tunnel_dev;
};
136
137 /* Tunnel hash table */
138
139 /*
140    4 hash tables:
141
142    3: (remote,local)
143    2: (remote,*)
144    1: (*,local)
145    0: (*,*)
146
147    We require exact key match i.e. if a key is present in packet
148    it will match only tunnel with the same key; if it is not present,
149    it will match only keyless tunnel.
150
151    All keyless packets, if not matching a configured keyless tunnel,
152    will match the fallback tunnel.
153  */
154
/*
 * Hash an IPv4 address (or GRE key) into one of HASH_SIZE (16) buckets
 * by folding the two low-order nibbles together.
 */
#define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF)

#define tunnels_r_l     tunnels[3]	/* (remote, local) both set */
#define tunnels_r       tunnels[2]	/* (remote, *)	*/
#define tunnels_l       tunnels[1]	/* (*, local)	*/
#define tunnels_wc      tunnels[0]	/* (*, *) wildcard */
/*
 * Locking : hash tables are protected by RCU and RTNL
 */

/* Walk one RCU-protected chain; expects a local 'struct ip_tunnel *t' */
#define for_each_ip_tunnel_rcu(start) \
	for (t = rcu_dereference(start); t; t = rcu_dereference(t->next))
167
/* often modified stats are per cpu, other are shared (netdev->stats) */
struct pcpu_tstats {
	/* Updated locklessly on the owning CPU via this_cpu_ptr()
	 * in the rx/tx fast paths; summed in ipgre_get_stats(). */
	unsigned long	rx_packets;
	unsigned long	rx_bytes;
	unsigned long	tx_packets;
	unsigned long	tx_bytes;
};
175
176 static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
177 {
178         struct pcpu_tstats sum = { 0 };
179         int i;
180
181         for_each_possible_cpu(i) {
182                 const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i);
183
184                 sum.rx_packets += tstats->rx_packets;
185                 sum.rx_bytes   += tstats->rx_bytes;
186                 sum.tx_packets += tstats->tx_packets;
187                 sum.tx_bytes   += tstats->tx_bytes;
188         }
189         dev->stats.rx_packets = sum.rx_packets;
190         dev->stats.rx_bytes   = sum.rx_bytes;
191         dev->stats.tx_packets = sum.tx_packets;
192         dev->stats.tx_bytes   = sum.tx_bytes;
193         return &dev->stats;
194 }
195
/* Given src, dst and key, find appropriate for input tunnel. */

/*
 * Receive-side tunnel lookup, called under rcu_read_lock().
 * Scans the four hash tables from most to least specific. Within each
 * table a perfect match (same underlay link AND same device type) is
 * returned immediately; otherwise the best imperfect match seen so far
 * is remembered and returned only after all tables have been scanned.
 * Falls back to the catch-all device when nothing matches.
 */
static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev,
					      __be32 remote, __be32 local,
					      __be32 key, __be16 gre_proto)
{
	struct net *net = dev_net(dev);
	int link = dev->ifindex;
	unsigned int h0 = HASH(remote);
	unsigned int h1 = HASH(key);
	struct ip_tunnel *t, *cand = NULL;
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	/* ETH_P_TEB payload wants an Ethernet (gretap) device */
	int dev_type = (gre_proto == htons(ETH_P_TEB)) ?
		       ARPHRD_ETHER : ARPHRD_IPGRE;
	/* score bits: 1 = link mismatch, 2 = dev type mismatch.
	 * 0 is perfect; cand_score starts above any reachable score. */
	int score, cand_score = 4;

	/* Pass 1: (remote, local) fully specified tunnels */
	for_each_ip_tunnel_rcu(ign->tunnels_r_l[h0 ^ h1]) {
		if (local != t->parms.iph.saddr ||
		    remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		/* ARPHRD_IPGRE devices accept either payload type;
		 * anything else must match dev_type exactly. */
		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Pass 2: (remote, *) tunnels */
	for_each_ip_tunnel_rcu(ign->tunnels_r[h0 ^ h1]) {
		if (remote != t->parms.iph.daddr ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Pass 3: (*, local) tunnels; a multicast destination we joined
	 * also counts as "local" here. */
	for_each_ip_tunnel_rcu(ign->tunnels_l[h1]) {
		if ((local != t->parms.iph.saddr &&
		     (local != t->parms.iph.daddr ||
		      !ipv4_is_multicast(local))) ||
		    key != t->parms.i_key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	/* Pass 4: (*, *) wildcard tunnels, still matched on key */
	for_each_ip_tunnel_rcu(ign->tunnels_wc[h1]) {
		if (t->parms.i_key != key ||
		    !(t->dev->flags & IFF_UP))
			continue;

		if (t->dev->type != ARPHRD_IPGRE &&
		    t->dev->type != dev_type)
			continue;

		score = 0;
		if (t->parms.link != link)
			score |= 1;
		if (t->dev->type != dev_type)
			score |= 2;
		if (score == 0)
			return t;

		if (score < cand_score) {
			cand = t;
			cand_score = score;
		}
	}

	if (cand != NULL)
		return cand;

	/* Nothing configured matched: use the fallback device if it is up */
	dev = ign->fb_tunnel_dev;
	if (dev->flags & IFF_UP)
		return netdev_priv(dev);

	return NULL;
}
319
320 static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign,
321                 struct ip_tunnel_parm *parms)
322 {
323         __be32 remote = parms->iph.daddr;
324         __be32 local = parms->iph.saddr;
325         __be32 key = parms->i_key;
326         unsigned int h = HASH(key);
327         int prio = 0;
328
329         if (local)
330                 prio |= 1;
331         if (remote && !ipv4_is_multicast(remote)) {
332                 prio |= 2;
333                 h ^= HASH(remote);
334         }
335
336         return &ign->tunnels[prio][h];
337 }
338
/* Hash chain for an existing tunnel, derived from its stored parameters */
static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign,
		struct ip_tunnel *t)
{
	return __ipgre_bucket(ign, &t->parms);
}
344
/*
 * Insert tunnel t at the head of its hash chain. Caller holds RTNL;
 * concurrent readers run under RCU, so t->next must be published
 * before t itself becomes reachable via *tp — hence the two
 * rcu_assign_pointer() calls in this order.
 */
static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t);

	rcu_assign_pointer(t->next, rtnl_dereference(*tp));
	rcu_assign_pointer(*tp, t);
}
352
/*
 * Remove tunnel t from its hash chain. Caller holds RTNL; the chain
 * is still traversed by RCU readers, so the splice-out is done with
 * rcu_assign_pointer(). If t is not on the chain this is a no-op.
 */
static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t)
{
	struct ip_tunnel __rcu **tp;
	struct ip_tunnel *iter;

	for (tp = ipgre_bucket(ign, t);
	     (iter = rtnl_dereference(*tp)) != NULL;
	     tp = &iter->next) {
		if (t == iter) {
			rcu_assign_pointer(*tp, t->next);
			break;
		}
	}
}
367
368 static struct ip_tunnel *ipgre_tunnel_find(struct net *net,
369                                            struct ip_tunnel_parm *parms,
370                                            int type)
371 {
372         __be32 remote = parms->iph.daddr;
373         __be32 local = parms->iph.saddr;
374         __be32 key = parms->i_key;
375         int link = parms->link;
376         struct ip_tunnel *t;
377         struct ip_tunnel __rcu **tp;
378         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
379
380         for (tp = __ipgre_bucket(ign, parms);
381              (t = rtnl_dereference(*tp)) != NULL;
382              tp = &t->next)
383                 if (local == t->parms.iph.saddr &&
384                     remote == t->parms.iph.daddr &&
385                     key == t->parms.i_key &&
386                     link == t->parms.link &&
387                     type == t->dev->type)
388                         break;
389
390         return t;
391 }
392
393 static struct ip_tunnel *ipgre_tunnel_locate(struct net *net,
394                 struct ip_tunnel_parm *parms, int create)
395 {
396         struct ip_tunnel *t, *nt;
397         struct net_device *dev;
398         char name[IFNAMSIZ];
399         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
400
401         t = ipgre_tunnel_find(net, parms, ARPHRD_IPGRE);
402         if (t || !create)
403                 return t;
404
405         if (parms->name[0])
406                 strlcpy(name, parms->name, IFNAMSIZ);
407         else
408                 sprintf(name, "gre%%d");
409
410         dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup);
411         if (!dev)
412           return NULL;
413
414         dev_net_set(dev, net);
415
416         if (strchr(name, '%')) {
417                 if (dev_alloc_name(dev, name) < 0)
418                         goto failed_free;
419         }
420
421         nt = netdev_priv(dev);
422         nt->parms = *parms;
423         dev->rtnl_link_ops = &ipgre_link_ops;
424
425         dev->mtu = ipgre_tunnel_bind_dev(dev);
426
427         if (register_netdevice(dev) < 0)
428                 goto failed_free;
429
430         dev_hold(dev);
431         ipgre_tunnel_link(ign, nt);
432         return nt;
433
434 failed_free:
435         free_netdev(dev);
436         return NULL;
437 }
438
439 static void ipgre_tunnel_uninit(struct net_device *dev)
440 {
441         struct net *net = dev_net(dev);
442         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
443
444         ipgre_tunnel_unlink(ign, netdev_priv(dev));
445         dev_put(dev);
446 }
447
448
/*
 * ICMP error handler for outgoing GRE packets. skb->data points at the
 * outer IPv4 header of the packet that triggered the error. We locate
 * the originating tunnel and record a soft error (err_count/err_time)
 * that the transmit path later reports via dst_link_failure().
 */
static void ipgre_err(struct sk_buff *skb, u32 info)
{

/* All the routers (except for Linux) return only
   8 bytes of packet payload. It means, that precise relaying of
   ICMP in the real Internet is absolutely infeasible.

   Moreover, Cisco "wise men" put GRE key to the third word
   in GRE header. It makes impossible maintaining even soft state for keyed
   GRE tunnels with enabled checksum. Tell them "thank you".

   Well, I wonder, rfc1812 was written by Cisco employee,
   what the hell these idiots break standards established
   by themselves???
 */

	struct iphdr *iph = (struct iphdr *)skb->data;
	__be16	      *p = (__be16*)(skb->data+(iph->ihl<<2));	/* GRE header */
	int grehlen = (iph->ihl<<2) + 4;	/* outer IP + GRE flags/proto words */
	const int type = icmp_hdr(skb)->type;
	const int code = icmp_hdr(skb)->code;
	struct ip_tunnel *t;
	__be16 flags;

	flags = p[0];
	if (flags&(GRE_CSUM|GRE_KEY|GRE_SEQ|GRE_ROUTING|GRE_VERSION)) {
		/* Only GRE version 0 without routing headers is ours */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			return;
		if (flags&GRE_KEY) {
			grehlen += 4;		/* key word */
			if (flags&GRE_CSUM)
				grehlen += 4;	/* checksum word precedes the key */
		}
	}

	/* If only 8 bytes returned, keyed message will be dropped here */
	if (skb_headlen(skb) < grehlen)
		return;

	switch (type) {
	default:
	case ICMP_PARAMETERPROB:
		return;

	case ICMP_DEST_UNREACH:
		switch (code) {
		case ICMP_SR_FAILED:
		case ICMP_PORT_UNREACH:
			/* Impossible event. */
			return;
		case ICMP_FRAG_NEEDED:
			/* Soft state for pmtu is maintained by IP core. */
			return;
		default:
			/* All others are translated to HOST_UNREACH.
			   rfc2003 contains "deep thoughts" about NET_UNREACH,
			   I believe they are just ether pollution. --ANK
			 */
			break;
		}
		break;
	case ICMP_TIME_EXCEEDED:
		if (code != ICMP_EXC_TTL)
			return;
		break;
	}

	rcu_read_lock();
	/* the key, when present, is the last word before the payload */
	t = ipgre_tunnel_lookup(skb->dev, iph->daddr, iph->saddr,
				flags & GRE_KEY ?
				*(((__be32 *)p) + (grehlen / 4) - 1) : 0,
				p[1]);
	/* NBMA or multicast tunnels have no single peer to charge */
	if (t == NULL || t->parms.iph.daddr == 0 ||
	    ipv4_is_multicast(t->parms.iph.daddr))
		goto out;

	/* Inherited TTL: TTL-exceeded is expected (e.g. traceroute) */
	if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
		goto out;

	if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
		t->err_count++;
	else
		t->err_count = 1;
	t->err_time = jiffies;
out:
	rcu_read_unlock();
}
536
537 static inline void ipgre_ecn_decapsulate(struct iphdr *iph, struct sk_buff *skb)
538 {
539         if (INET_ECN_is_ce(iph->tos)) {
540                 if (skb->protocol == htons(ETH_P_IP)) {
541                         IP_ECN_set_ce(ip_hdr(skb));
542                 } else if (skb->protocol == htons(ETH_P_IPV6)) {
543                         IP6_ECN_set_ce(ipv6_hdr(skb));
544                 }
545         }
546 }
547
548 static inline u8
549 ipgre_ecn_encapsulate(u8 tos, struct iphdr *old_iph, struct sk_buff *skb)
550 {
551         u8 inner = 0;
552         if (skb->protocol == htons(ETH_P_IP))
553                 inner = old_iph->tos;
554         else if (skb->protocol == htons(ETH_P_IPV6))
555                 inner = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
556         return INET_ECN_encapsulate(tos, inner);
557 }
558
/*
 * GRE receive path (version-0 GRE handed over by the gre demux).
 * Parses the GRE header, finds the owning tunnel, verifies checksum
 * and sequence number when configured, then re-injects the inner
 * packet with netif_rx(). Always consumes the skb and returns 0;
 * packets matching no tunnel get an ICMP port-unreachable reply.
 */
static int ipgre_rcv(struct sk_buff *skb)
{
	struct iphdr *iph;
	u8     *h;
	__be16    flags;
	__sum16   csum = 0;
	__be32 key = 0;
	u32    seqno = 0;
	struct ip_tunnel *tunnel;
	int    offset = 4;	/* base GRE header: flags + protocol */
	__be16 gre_proto;

	/* 16 = worst-case header: base + csum + key + seq (4 bytes each) */
	if (!pskb_may_pull(skb, 16))
		goto drop_nolock;

	iph = ip_hdr(skb);
	h = skb->data;
	flags = *(__be16*)h;

	if (flags&(GRE_CSUM|GRE_KEY|GRE_ROUTING|GRE_SEQ|GRE_VERSION)) {
		/* - Version must be 0.
		   - We do not support routing headers.
		 */
		if (flags&(GRE_VERSION|GRE_ROUTING))
			goto drop_nolock;

		if (flags&GRE_CSUM) {
			switch (skb->ip_summed) {
			case CHECKSUM_COMPLETE:
				/* hardware already summed it; 0 means OK */
				csum = csum_fold(skb->csum);
				if (!csum)
					break;
				/* fall through */
			case CHECKSUM_NONE:
				skb->csum = 0;
				csum = __skb_checksum_complete(skb);
				skb->ip_summed = CHECKSUM_COMPLETE;
			}
			offset += 4;
		}
		if (flags&GRE_KEY) {
			key = *(__be32*)(h + offset);
			offset += 4;
		}
		if (flags&GRE_SEQ) {
			seqno = ntohl(*(__be32*)(h + offset));
			offset += 4;
		}
	}

	gre_proto = *(__be16 *)(h + 2);

	rcu_read_lock();
	if ((tunnel = ipgre_tunnel_lookup(skb->dev,
					  iph->saddr, iph->daddr, key,
					  gre_proto))) {
		struct pcpu_tstats *tstats;

		secpath_reset(skb);

		skb->protocol = gre_proto;
		/* WCCP version 1 and 2 protocol decoding.
		 * - Change protocol to IP
		 * - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
		 */
		if (flags == 0 && gre_proto == htons(ETH_P_WCCP)) {
			skb->protocol = htons(ETH_P_IP);
			/* version nibble != 4 => not yet at the IP header */
			if ((*(h + offset) & 0xF0) != 0x40)
				offset += 4;
		}

		skb->mac_header = skb->network_header;
		__pskb_pull(skb, offset);	/* strip the GRE header */
		skb_postpull_rcsum(skb, skb_transport_header(skb), offset);
		skb->pkt_type = PACKET_HOST;
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			/* Looped back packet, drop it! */
			if (skb_rtable(skb)->fl.iif == 0)
				goto drop;
			tunnel->dev->stats.multicast++;
			skb->pkt_type = PACKET_BROADCAST;
		}
#endif

		/* checksum present but bad, or required but absent */
		if (((flags&GRE_CSUM) && csum) ||
		    (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) {
			tunnel->dev->stats.rx_crc_errors++;
			tunnel->dev->stats.rx_errors++;
			goto drop;
		}
		if (tunnel->parms.i_flags&GRE_SEQ) {
			/* reject missing or backwards sequence numbers */
			if (!(flags&GRE_SEQ) ||
			    (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) {
				tunnel->dev->stats.rx_fifo_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}
			tunnel->i_seqno = seqno + 1;
		}

		/* Warning: All skb pointers will be invalidated! */
		if (tunnel->dev->type == ARPHRD_ETHER) {
			if (!pskb_may_pull(skb, ETH_HLEN)) {
				tunnel->dev->stats.rx_length_errors++;
				tunnel->dev->stats.rx_errors++;
				goto drop;
			}

			/* re-read after pskb_may_pull() may have moved data */
			iph = ip_hdr(skb);
			skb->protocol = eth_type_trans(skb, tunnel->dev);
			skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
		}

		tstats = this_cpu_ptr(tunnel->dev->tstats);
		tstats->rx_packets++;
		tstats->rx_bytes += skb->len;

		__skb_tunnel_rx(skb, tunnel->dev);

		skb_reset_network_header(skb);
		ipgre_ecn_decapsulate(iph, skb);

		netif_rx(skb);

		rcu_read_unlock();
		return 0;
	}
	/* no tunnel claimed this packet */
	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);

drop:
	rcu_read_unlock();
drop_nolock:
	kfree_skb(skb);
	return 0;
}
695
696 static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
697 {
698         struct ip_tunnel *tunnel = netdev_priv(dev);
699         struct pcpu_tstats *tstats;
700         struct iphdr  *old_iph = ip_hdr(skb);
701         struct iphdr  *tiph;
702         u8     tos;
703         __be16 df;
704         struct rtable *rt;                      /* Route to the other host */
705         struct net_device *tdev;                /* Device to other host */
706         struct iphdr  *iph;                     /* Our new IP header */
707         unsigned int max_headroom;              /* The extra header space needed */
708         int    gre_hlen;
709         __be32 dst;
710         int    mtu;
711
712         if (dev->type == ARPHRD_ETHER)
713                 IPCB(skb)->flags = 0;
714
715         if (dev->header_ops && dev->type == ARPHRD_IPGRE) {
716                 gre_hlen = 0;
717                 tiph = (struct iphdr *)skb->data;
718         } else {
719                 gre_hlen = tunnel->hlen;
720                 tiph = &tunnel->parms.iph;
721         }
722
723         if ((dst = tiph->daddr) == 0) {
724                 /* NBMA tunnel */
725
726                 if (skb_dst(skb) == NULL) {
727                         dev->stats.tx_fifo_errors++;
728                         goto tx_error;
729                 }
730
731                 if (skb->protocol == htons(ETH_P_IP)) {
732                         rt = skb_rtable(skb);
733                         if ((dst = rt->rt_gateway) == 0)
734                                 goto tx_error_icmp;
735                 }
736 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
737                 else if (skb->protocol == htons(ETH_P_IPV6)) {
738                         struct in6_addr *addr6;
739                         int addr_type;
740                         struct neighbour *neigh = skb_dst(skb)->neighbour;
741
742                         if (neigh == NULL)
743                                 goto tx_error;
744
745                         addr6 = (struct in6_addr *)&neigh->primary_key;
746                         addr_type = ipv6_addr_type(addr6);
747
748                         if (addr_type == IPV6_ADDR_ANY) {
749                                 addr6 = &ipv6_hdr(skb)->daddr;
750                                 addr_type = ipv6_addr_type(addr6);
751                         }
752
753                         if ((addr_type & IPV6_ADDR_COMPATv4) == 0)
754                                 goto tx_error_icmp;
755
756                         dst = addr6->s6_addr32[3];
757                 }
758 #endif
759                 else
760                         goto tx_error;
761         }
762
763         tos = tiph->tos;
764         if (tos == 1) {
765                 tos = 0;
766                 if (skb->protocol == htons(ETH_P_IP))
767                         tos = old_iph->tos;
768                 else if (skb->protocol == htons(ETH_P_IPV6))
769                         tos = ipv6_get_dsfield((struct ipv6hdr *)old_iph);
770         }
771
772         {
773                 struct flowi fl = {
774                         .oif = tunnel->parms.link,
775                         .nl_u = {
776                                 .ip4_u = {
777                                         .daddr = dst,
778                                         .saddr = tiph->saddr,
779                                         .tos = RT_TOS(tos)
780                                 }
781                         },
782                         .proto = IPPROTO_GRE
783                 }
784 ;
785                 if (ip_route_output_key(dev_net(dev), &rt, &fl)) {
786                         dev->stats.tx_carrier_errors++;
787                         goto tx_error;
788                 }
789         }
790         tdev = rt->dst.dev;
791
792         if (tdev == dev) {
793                 ip_rt_put(rt);
794                 dev->stats.collisions++;
795                 goto tx_error;
796         }
797
798         df = tiph->frag_off;
799         if (df)
800                 mtu = dst_mtu(&rt->dst) - dev->hard_header_len - tunnel->hlen;
801         else
802                 mtu = skb_dst(skb) ? dst_mtu(skb_dst(skb)) : dev->mtu;
803
804         if (skb_dst(skb))
805                 skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu);
806
807         if (skb->protocol == htons(ETH_P_IP)) {
808                 df |= (old_iph->frag_off&htons(IP_DF));
809
810                 if ((old_iph->frag_off&htons(IP_DF)) &&
811                     mtu < ntohs(old_iph->tot_len)) {
812                         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
813                         ip_rt_put(rt);
814                         goto tx_error;
815                 }
816         }
817 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
818         else if (skb->protocol == htons(ETH_P_IPV6)) {
819                 struct rt6_info *rt6 = (struct rt6_info *)skb_dst(skb);
820
821                 if (rt6 && mtu < dst_mtu(skb_dst(skb)) && mtu >= IPV6_MIN_MTU) {
822                         if ((tunnel->parms.iph.daddr &&
823                              !ipv4_is_multicast(tunnel->parms.iph.daddr)) ||
824                             rt6->rt6i_dst.plen == 128) {
825                                 rt6->rt6i_flags |= RTF_MODIFIED;
826                                 skb_dst(skb)->metrics[RTAX_MTU-1] = mtu;
827                         }
828                 }
829
830                 if (mtu >= IPV6_MIN_MTU && mtu < skb->len - tunnel->hlen + gre_hlen) {
831                         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
832                         ip_rt_put(rt);
833                         goto tx_error;
834                 }
835         }
836 #endif
837
838         if (tunnel->err_count > 0) {
839                 if (time_before(jiffies,
840                                 tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
841                         tunnel->err_count--;
842
843                         dst_link_failure(skb);
844                 } else
845                         tunnel->err_count = 0;
846         }
847
848         max_headroom = LL_RESERVED_SPACE(tdev) + gre_hlen + rt->dst.header_len;
849
850         if (skb_headroom(skb) < max_headroom || skb_shared(skb)||
851             (skb_cloned(skb) && !skb_clone_writable(skb, 0))) {
852                 struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom);
853                 if (max_headroom > dev->needed_headroom)
854                         dev->needed_headroom = max_headroom;
855                 if (!new_skb) {
856                         ip_rt_put(rt);
857                         dev->stats.tx_dropped++;
858                         dev_kfree_skb(skb);
859                         return NETDEV_TX_OK;
860                 }
861                 if (skb->sk)
862                         skb_set_owner_w(new_skb, skb->sk);
863                 dev_kfree_skb(skb);
864                 skb = new_skb;
865                 old_iph = ip_hdr(skb);
866         }
867
868         skb_reset_transport_header(skb);
869         skb_push(skb, gre_hlen);
870         skb_reset_network_header(skb);
871         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
872         IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
873                               IPSKB_REROUTED);
874         skb_dst_drop(skb);
875         skb_dst_set(skb, &rt->dst);
876
877         /*
878          *      Push down and install the IPIP header.
879          */
880
881         iph                     =       ip_hdr(skb);
882         iph->version            =       4;
883         iph->ihl                =       sizeof(struct iphdr) >> 2;
884         iph->frag_off           =       df;
885         iph->protocol           =       IPPROTO_GRE;
886         iph->tos                =       ipgre_ecn_encapsulate(tos, old_iph, skb);
887         iph->daddr              =       rt->rt_dst;
888         iph->saddr              =       rt->rt_src;
889
890         if ((iph->ttl = tiph->ttl) == 0) {
891                 if (skb->protocol == htons(ETH_P_IP))
892                         iph->ttl = old_iph->ttl;
893 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
894                 else if (skb->protocol == htons(ETH_P_IPV6))
895                         iph->ttl = ((struct ipv6hdr *)old_iph)->hop_limit;
896 #endif
897                 else
898                         iph->ttl = dst_metric(&rt->dst, RTAX_HOPLIMIT);
899         }
900
901         ((__be16 *)(iph + 1))[0] = tunnel->parms.o_flags;
902         ((__be16 *)(iph + 1))[1] = (dev->type == ARPHRD_ETHER) ?
903                                    htons(ETH_P_TEB) : skb->protocol;
904
905         if (tunnel->parms.o_flags&(GRE_KEY|GRE_CSUM|GRE_SEQ)) {
906                 __be32 *ptr = (__be32*)(((u8*)iph) + tunnel->hlen - 4);
907
908                 if (tunnel->parms.o_flags&GRE_SEQ) {
909                         ++tunnel->o_seqno;
910                         *ptr = htonl(tunnel->o_seqno);
911                         ptr--;
912                 }
913                 if (tunnel->parms.o_flags&GRE_KEY) {
914                         *ptr = tunnel->parms.o_key;
915                         ptr--;
916                 }
917                 if (tunnel->parms.o_flags&GRE_CSUM) {
918                         *ptr = 0;
919                         *(__sum16*)ptr = ip_compute_csum((void*)(iph+1), skb->len - sizeof(struct iphdr));
920                 }
921         }
922
923         nf_reset(skb);
924         tstats = this_cpu_ptr(dev->tstats);
925         __IPTUNNEL_XMIT(tstats, &dev->stats);
926         return NETDEV_TX_OK;
927
928 tx_error_icmp:
929         dst_link_failure(skb);
930
931 tx_error:
932         dev->stats.tx_errors++;
933         dev_kfree_skb(skb);
934         return NETDEV_TX_OK;
935 }
936
/*
 * Bind the tunnel to its underlying output device and compute a
 * suitable MTU for it.
 *
 * The route towards parms.iph.daddr (or, failing that, the device
 * named by parms.link) is used to guess the lower device, from which
 * needed_headroom and the returned MTU are derived.
 *
 * Returns the MTU the tunnel device should use, clamped to >= 68
 * (the minimum IPv4 MTU).  Caller holds RTNL.
 */
static int ipgre_tunnel_bind_dev(struct net_device *dev)
{
	struct net_device *tdev = NULL;
	struct ip_tunnel *tunnel;
	struct iphdr *iph;
	int hlen = LL_MAX_HEADER;	/* pessimistic default when no lower dev is found */
	int mtu = ETH_DATA_LEN;
	int addend = sizeof(struct iphdr) + 4;	/* outer IP header + basic GRE header */

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	/* Guess output device to choose reasonable mtu and needed_headroom */

	if (iph->daddr) {
		struct flowi fl = {
			.oif = tunnel->parms.link,
			.nl_u = {
				.ip4_u = {
					.daddr = iph->daddr,
					.saddr = iph->saddr,
					.tos = RT_TOS(iph->tos)
				}
			},
			.proto = IPPROTO_GRE
		};
		struct rtable *rt;

		if (!ip_route_output_key(dev_net(dev), &rt, &fl)) {
			tdev = rt->dst.dev;
			ip_rt_put(rt);
		}

		/* A fixed destination makes this a point-to-point link,
		 * except for gretap devices, which stay Ethernet-like. */
		if (dev->type != ARPHRD_ETHER)
			dev->flags |= IFF_POINTOPOINT;
	}

	if (!tdev && tunnel->parms.link)
		tdev = __dev_get_by_index(dev_net(dev), tunnel->parms.link);

	if (tdev) {
		hlen = tdev->hard_header_len + tdev->needed_headroom;
		mtu = tdev->mtu;
	}
	dev->iflink = tunnel->parms.link;

	/* Precalculate GRE options length */
	if (tunnel->parms.o_flags&(GRE_CSUM|GRE_KEY|GRE_SEQ)) {
		if (tunnel->parms.o_flags&GRE_CSUM)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_KEY)
			addend += 4;
		if (tunnel->parms.o_flags&GRE_SEQ)
			addend += 4;
	}
	dev->needed_headroom = addend + hlen;
	mtu -= dev->hard_header_len + addend;

	/* 68 is the minimum MTU an IPv4 host must accept (RFC 791) */
	if (mtu < 68)
		mtu = 68;

	tunnel->hlen = addend;	/* total encapsulation overhead in bytes */

	return mtu;
}
1002
/*
 * Legacy ioctl-based tunnel management (SIOC{GET,ADD,CHG,DEL}TUNNEL).
 *
 * On the fallback device "gre0" these commands operate on the tunnel
 * described by the user-supplied ip_tunnel_parm; on any other gre
 * device they operate on that device itself.  ADD/CHG/DEL require
 * CAP_NET_ADMIN.  Caller (dev_ioctl) holds RTNL.
 */
static int
ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd)
{
	int err = 0;
	struct ip_tunnel_parm p;
	struct ip_tunnel *t;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);

	switch (cmd) {
	case SIOCGETTUNNEL:
		t = NULL;
		if (dev == ign->fb_tunnel_dev) {
			/* Via gre0: look up the named tunnel; fall back
			 * to gre0 itself when it does not exist. */
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p))) {
				err = -EFAULT;
				break;
			}
			t = ipgre_tunnel_locate(net, &p, 0);
		}
		if (t == NULL)
			t = netdev_priv(dev);
		memcpy(&p, &t->parms, sizeof(p));
		if (copy_to_user(ifr->ifr_ifru.ifru_data, &p, sizeof(p)))
			err = -EFAULT;
		break;

	case SIOCADDTUNNEL:
	case SIOCCHGTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		err = -EFAULT;
		if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
			goto done;

		/* Only plain IPv4 headers without options and no GRE
		 * version/routing bits are supported. */
		err = -EINVAL;
		if (p.iph.version != 4 || p.iph.protocol != IPPROTO_GRE ||
		    p.iph.ihl != 5 || (p.iph.frag_off&htons(~IP_DF)) ||
		    ((p.i_flags|p.o_flags)&(GRE_VERSION|GRE_ROUTING)))
			goto done;
		if (p.iph.ttl)
			p.iph.frag_off |= htons(IP_DF);	/* fixed TTL implies PMTU discovery */

		/* Keys are meaningful only when the KEY flag is set */
		if (!(p.i_flags&GRE_KEY))
			p.i_key = 0;
		if (!(p.o_flags&GRE_KEY))
			p.o_key = 0;

		t = ipgre_tunnel_locate(net, &p, cmd == SIOCADDTUNNEL);

		if (dev != ign->fb_tunnel_dev && cmd == SIOCCHGTUNNEL) {
			if (t != NULL) {
				/* New parameters already belong to a tunnel:
				 * only acceptable if it is this device. */
				if (t->dev != dev) {
					err = -EEXIST;
					break;
				}
			} else {
				unsigned int nflags = 0;

				t = netdev_priv(dev);

				if (ipv4_is_multicast(p.iph.daddr))
					nflags = IFF_BROADCAST;
				else if (p.iph.daddr)
					nflags = IFF_POINTOPOINT;

				/* Cannot flip between broadcast and p-t-p mode */
				if ((dev->flags^nflags)&(IFF_POINTOPOINT|IFF_BROADCAST)) {
					err = -EINVAL;
					break;
				}
				/* Addresses/keys feed the lookup hash: re-link */
				ipgre_tunnel_unlink(ign, t);
				t->parms.iph.saddr = p.iph.saddr;
				t->parms.iph.daddr = p.iph.daddr;
				t->parms.i_key = p.i_key;
				t->parms.o_key = p.o_key;
				memcpy(dev->dev_addr, &p.iph.saddr, 4);
				memcpy(dev->broadcast, &p.iph.daddr, 4);
				ipgre_tunnel_link(ign, t);
				netdev_state_change(dev);
			}
		}

		if (t) {
			err = 0;
			if (cmd == SIOCCHGTUNNEL) {
				/* These do not affect hashing */
				t->parms.iph.ttl = p.iph.ttl;
				t->parms.iph.tos = p.iph.tos;
				t->parms.iph.frag_off = p.iph.frag_off;
				if (t->parms.link != p.link) {
					t->parms.link = p.link;
					dev->mtu = ipgre_tunnel_bind_dev(dev);
					netdev_state_change(dev);
				}
			}
			/* Return the parameters actually in effect */
			if (copy_to_user(ifr->ifr_ifru.ifru_data, &t->parms, sizeof(p)))
				err = -EFAULT;
		} else
			err = (cmd == SIOCADDTUNNEL ? -ENOBUFS : -ENOENT);
		break;

	case SIOCDELTUNNEL:
		err = -EPERM;
		if (!capable(CAP_NET_ADMIN))
			goto done;

		if (dev == ign->fb_tunnel_dev) {
			/* Via gre0: delete the named tunnel, but never
			 * gre0 itself. */
			err = -EFAULT;
			if (copy_from_user(&p, ifr->ifr_ifru.ifru_data, sizeof(p)))
				goto done;
			err = -ENOENT;
			if ((t = ipgre_tunnel_locate(net, &p, 0)) == NULL)
				goto done;
			err = -EPERM;
			if (t == netdev_priv(ign->fb_tunnel_dev))
				goto done;
			dev = t->dev;
		}
		unregister_netdevice(dev);
		err = 0;
		break;

	default:
		err = -EINVAL;
	}

done:
	return err;
}
1132
1133 static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu)
1134 {
1135         struct ip_tunnel *tunnel = netdev_priv(dev);
1136         if (new_mtu < 68 ||
1137             new_mtu > 0xFFF8 - dev->hard_header_len - tunnel->hlen)
1138                 return -EINVAL;
1139         dev->mtu = new_mtu;
1140         return 0;
1141 }
1142
1143 /* Nice toy. Unfortunately, useless in real life :-)
   It allows one to construct a virtual multiprotocol broadcast "LAN"
1145    over the Internet, provided multicast routing is tuned.
1146
1147
   I have no idea whether this bicycle was invented before me,
   so I had to set ARPHRD_IPGRE to a random value.
   I have the impression that Cisco could make something similar,
   but this feature is apparently missing in IOS<=11.2(8).
1152
1153    I set up 10.66.66/24 and fec0:6666:6666::0/96 as virtual networks
1154    with broadcast 224.66.66.66. If you have access to mbone, play with me :-)
1155
1156    ping -t 255 224.66.66.66
1157
1158    If nobody answers, mbone does not work.
1159
1160    ip tunnel add Universe mode gre remote 224.66.66.66 local <Your_real_addr> ttl 255
1161    ip addr add 10.66.66.<somewhat>/24 dev Universe
1162    ifconfig Universe up
1163    ifconfig Universe add fe80::<Your_real_addr>/10
1164    ifconfig Universe add fec0:6666:6666::<Your_real_addr>/96
1165    ftp 10.66.66.66
1166    ...
1167    ftp fec0:6666:6666::193.233.7.65
1168    ...
1169
1170  */
1171
1172 static int ipgre_header(struct sk_buff *skb, struct net_device *dev,
1173                         unsigned short type,
1174                         const void *daddr, const void *saddr, unsigned int len)
1175 {
1176         struct ip_tunnel *t = netdev_priv(dev);
1177         struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen);
1178         __be16 *p = (__be16*)(iph+1);
1179
1180         memcpy(iph, &t->parms.iph, sizeof(struct iphdr));
1181         p[0]            = t->parms.o_flags;
1182         p[1]            = htons(type);
1183
1184         /*
1185          *      Set the source hardware address.
1186          */
1187
1188         if (saddr)
1189                 memcpy(&iph->saddr, saddr, 4);
1190         if (daddr)
1191                 memcpy(&iph->daddr, daddr, 4);
1192         if (iph->daddr)
1193                 return t->hlen;
1194
1195         return -t->hlen;
1196 }
1197
1198 static int ipgre_header_parse(const struct sk_buff *skb, unsigned char *haddr)
1199 {
1200         struct iphdr *iph = (struct iphdr *) skb_mac_header(skb);
1201         memcpy(haddr, &iph->saddr, 4);
1202         return 4;
1203 }
1204
/* Link-layer header ops for NBMA and broadcast-mode GRE devices */
static const struct header_ops ipgre_header_ops = {
	.create = ipgre_header,
	.parse  = ipgre_header_parse,
};
1209
1210 #ifdef CONFIG_NET_IPGRE_BROADCAST
/*
 * ndo_open for broadcast-mode tunnels: join the multicast group
 * parms.iph.daddr on the device the route towards the group points
 * at, remembering that device's ifindex in t->mlink so that
 * ipgre_close() can leave the group again.
 *
 * Returns 0, or -EADDRNOTAVAIL when no route or in_device exists.
 * Caller holds RTNL (hence __in_dev_get_rtnl).
 */
static int ipgre_open(struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);

	if (ipv4_is_multicast(t->parms.iph.daddr)) {
		struct flowi fl = {
			.oif = t->parms.link,
			.nl_u = {
				.ip4_u = {
					.daddr = t->parms.iph.daddr,
					.saddr = t->parms.iph.saddr,
					.tos = RT_TOS(t->parms.iph.tos)
				}
			},
			.proto = IPPROTO_GRE
		};
		struct rtable *rt;

		if (ip_route_output_key(dev_net(dev), &rt, &fl))
			return -EADDRNOTAVAIL;
		/* NB: from here on "dev" is the underlying device */
		dev = rt->dst.dev;
		ip_rt_put(rt);
		if (__in_dev_get_rtnl(dev) == NULL)
			return -EADDRNOTAVAIL;
		t->mlink = dev->ifindex;
		ip_mc_inc_group(__in_dev_get_rtnl(dev), t->parms.iph.daddr);
	}
	return 0;
}
1240
1241 static int ipgre_close(struct net_device *dev)
1242 {
1243         struct ip_tunnel *t = netdev_priv(dev);
1244
1245         if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) {
1246                 struct in_device *in_dev;
1247                 in_dev = inetdev_by_index(dev_net(dev), t->mlink);
1248                 if (in_dev)
1249                         ip_mc_dec_group(in_dev, t->parms.iph.daddr);
1250         }
1251         return 0;
1252 }
1253
1254 #endif
1255
/* Operations for layer-3 "gre" (ARPHRD_IPGRE) devices.  Open/stop are
 * only needed for broadcast-mode tunnels, which must join and leave
 * their multicast group. */
static const struct net_device_ops ipgre_netdev_ops = {
	.ndo_init               = ipgre_tunnel_init,
	.ndo_uninit             = ipgre_tunnel_uninit,
#ifdef CONFIG_NET_IPGRE_BROADCAST
	.ndo_open               = ipgre_open,
	.ndo_stop               = ipgre_close,
#endif
	.ndo_start_xmit         = ipgre_tunnel_xmit,
	.ndo_do_ioctl           = ipgre_tunnel_ioctl,
	.ndo_change_mtu         = ipgre_tunnel_change_mtu,
	.ndo_get_stats          = ipgre_get_stats,
};
1268
/*
 * Device destructor (dev->destructor): runs once unregistration has
 * completed.  Releases the per-cpu stats before the netdev itself.
 */
static void ipgre_dev_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}
1274
/*
 * netdev setup for layer-3 "gre" devices (both the fallback device
 * and rtnl-created ones): no ARP, 4-byte (IPv4 address) link-layer
 * addresses, confined to its namespace, and a default MTU that
 * leaves room for the IP + basic GRE encapsulation.  Clearing
 * IFF_XMIT_DST_RELEASE keeps skb_dst() available in the xmit path.
 */
static void ipgre_tunnel_setup(struct net_device *dev)
{
	dev->netdev_ops         = &ipgre_netdev_ops;
	dev->destructor         = ipgre_dev_free;

	dev->type               = ARPHRD_IPGRE;
	dev->needed_headroom    = LL_MAX_HEADER + sizeof(struct iphdr) + 4;
	dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 4;
	dev->flags              = IFF_NOARP;
	dev->iflink             = 0;
	dev->addr_len           = 4;
	dev->features           |= NETIF_F_NETNS_LOCAL;
	dev->priv_flags         &= ~IFF_XMIT_DST_RELEASE;
}
1289
/*
 * ndo_init for "gre" devices: record identity, expose the outer IPv4
 * endpoints as the device's 4-byte dev_addr/broadcast, and allocate
 * per-cpu stats.  Broadcast-mode tunnels (multicast daddr) require a
 * fixed local address and use the GRE header_ops; NBMA tunnels
 * (no daddr) use them as well so callers can supply a destination.
 */
static int ipgre_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel;
	struct iphdr *iph;

	tunnel = netdev_priv(dev);
	iph = &tunnel->parms.iph;

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	/* addr_len == 4: the "hardware" addresses are the tunnel endpoints */
	memcpy(dev->dev_addr, &tunnel->parms.iph.saddr, 4);
	memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4);

	if (iph->daddr) {
#ifdef CONFIG_NET_IPGRE_BROADCAST
		if (ipv4_is_multicast(iph->daddr)) {
			if (!iph->saddr)
				return -EINVAL;	/* need a source to join the group */
			dev->flags = IFF_BROADCAST;
			dev->header_ops = &ipgre_header_ops;
		}
#endif
	} else
		dev->header_ops = &ipgre_header_ops;

	dev->tstats = alloc_percpu(struct pcpu_tstats);
	if (!dev->tstats)
		return -ENOMEM;

	return 0;
}
1322
/*
 * Set up the per-namespace fallback device "gre0".  It is not hashed
 * by address/key; instead it is published as the wildcard receiver
 * (tunnels_wc[0]) so GRE packets matching no other tunnel are
 * delivered to it.
 */
static void ipgre_fb_tunnel_init(struct net_device *dev)
{
	struct ip_tunnel *tunnel = netdev_priv(dev);
	struct iphdr *iph = &tunnel->parms.iph;
	struct ipgre_net *ign = net_generic(dev_net(dev), ipgre_net_id);

	tunnel->dev = dev;
	strcpy(tunnel->parms.name, dev->name);

	iph->version            = 4;
	iph->protocol           = IPPROTO_GRE;
	iph->ihl                = 5;
	tunnel->hlen            = sizeof(struct iphdr) + 4;	/* IP + basic GRE */

	/* The wildcard slot holds a reference on the device */
	dev_hold(dev);
	rcu_assign_pointer(ign->tunnels_wc[0], tunnel);
}
1340
1341
/* Hooks registered with the shared GRE demultiplexer (GREPROTO_CISCO) */
static const struct gre_protocol ipgre_protocol = {
	.handler     = ipgre_rcv,
	.err_handler = ipgre_err,
};
1346
1347 static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head)
1348 {
1349         int prio;
1350
1351         for (prio = 0; prio < 4; prio++) {
1352                 int h;
1353                 for (h = 0; h < HASH_SIZE; h++) {
1354                         struct ip_tunnel *t;
1355
1356                         t = rtnl_dereference(ign->tunnels[prio][h]);
1357
1358                         while (t != NULL) {
1359                                 unregister_netdevice_queue(t->dev, head);
1360                                 t = rtnl_dereference(t->next);
1361                         }
1362                 }
1363         }
1364 }
1365
1366 static int __net_init ipgre_init_net(struct net *net)
1367 {
1368         struct ipgre_net *ign = net_generic(net, ipgre_net_id);
1369         int err;
1370
1371         ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "gre0",
1372                                            ipgre_tunnel_setup);
1373         if (!ign->fb_tunnel_dev) {
1374                 err = -ENOMEM;
1375                 goto err_alloc_dev;
1376         }
1377         dev_net_set(ign->fb_tunnel_dev, net);
1378
1379         ipgre_fb_tunnel_init(ign->fb_tunnel_dev);
1380         ign->fb_tunnel_dev->rtnl_link_ops = &ipgre_link_ops;
1381
1382         if ((err = register_netdev(ign->fb_tunnel_dev)))
1383                 goto err_reg_dev;
1384
1385         return 0;
1386
1387 err_reg_dev:
1388         free_netdev(ign->fb_tunnel_dev);
1389 err_alloc_dev:
1390         return err;
1391 }
1392
1393 static void __net_exit ipgre_exit_net(struct net *net)
1394 {
1395         struct ipgre_net *ign;
1396         LIST_HEAD(list);
1397
1398         ign = net_generic(net, ipgre_net_id);
1399         rtnl_lock();
1400         ipgre_destroy_tunnels(ign, &list);
1401         unregister_netdevice_many(&list);
1402         rtnl_unlock();
1403 }
1404
/* Per-namespace state: tunnel hash tables plus the fallback device */
static struct pernet_operations ipgre_net_ops = {
	.init = ipgre_init_net,
	.exit = ipgre_exit_net,
	.id   = &ipgre_net_id,
	.size = sizeof(struct ipgre_net),
};
1411
1412 static int ipgre_tunnel_validate(struct nlattr *tb[], struct nlattr *data[])
1413 {
1414         __be16 flags;
1415
1416         if (!data)
1417                 return 0;
1418
1419         flags = 0;
1420         if (data[IFLA_GRE_IFLAGS])
1421                 flags |= nla_get_be16(data[IFLA_GRE_IFLAGS]);
1422         if (data[IFLA_GRE_OFLAGS])
1423                 flags |= nla_get_be16(data[IFLA_GRE_OFLAGS]);
1424         if (flags & (GRE_VERSION|GRE_ROUTING))
1425                 return -EINVAL;
1426
1427         return 0;
1428 }
1429
1430 static int ipgre_tap_validate(struct nlattr *tb[], struct nlattr *data[])
1431 {
1432         __be32 daddr;
1433
1434         if (tb[IFLA_ADDRESS]) {
1435                 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
1436                         return -EINVAL;
1437                 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
1438                         return -EADDRNOTAVAIL;
1439         }
1440
1441         if (!data)
1442                 goto out;
1443
1444         if (data[IFLA_GRE_REMOTE]) {
1445                 memcpy(&daddr, nla_data(data[IFLA_GRE_REMOTE]), 4);
1446                 if (!daddr)
1447                         return -EINVAL;
1448         }
1449
1450 out:
1451         return ipgre_tunnel_validate(tb, data);
1452 }
1453
1454 static void ipgre_netlink_parms(struct nlattr *data[],
1455                                 struct ip_tunnel_parm *parms)
1456 {
1457         memset(parms, 0, sizeof(*parms));
1458
1459         parms->iph.protocol = IPPROTO_GRE;
1460
1461         if (!data)
1462                 return;
1463
1464         if (data[IFLA_GRE_LINK])
1465                 parms->link = nla_get_u32(data[IFLA_GRE_LINK]);
1466
1467         if (data[IFLA_GRE_IFLAGS])
1468                 parms->i_flags = nla_get_be16(data[IFLA_GRE_IFLAGS]);
1469
1470         if (data[IFLA_GRE_OFLAGS])
1471                 parms->o_flags = nla_get_be16(data[IFLA_GRE_OFLAGS]);
1472
1473         if (data[IFLA_GRE_IKEY])
1474                 parms->i_key = nla_get_be32(data[IFLA_GRE_IKEY]);
1475
1476         if (data[IFLA_GRE_OKEY])
1477                 parms->o_key = nla_get_be32(data[IFLA_GRE_OKEY]);
1478
1479         if (data[IFLA_GRE_LOCAL])
1480                 parms->iph.saddr = nla_get_be32(data[IFLA_GRE_LOCAL]);
1481
1482         if (data[IFLA_GRE_REMOTE])
1483                 parms->iph.daddr = nla_get_be32(data[IFLA_GRE_REMOTE]);
1484
1485         if (data[IFLA_GRE_TTL])
1486                 parms->iph.ttl = nla_get_u8(data[IFLA_GRE_TTL]);
1487
1488         if (data[IFLA_GRE_TOS])
1489                 parms->iph.tos = nla_get_u8(data[IFLA_GRE_TOS]);
1490
1491         if (!data[IFLA_GRE_PMTUDISC] || nla_get_u8(data[IFLA_GRE_PMTUDISC]))
1492                 parms->iph.frag_off = htons(IP_DF);
1493 }
1494
1495 static int ipgre_tap_init(struct net_device *dev)
1496 {
1497         struct ip_tunnel *tunnel;
1498
1499         tunnel = netdev_priv(dev);
1500
1501         tunnel->dev = dev;
1502         strcpy(tunnel->parms.name, dev->name);
1503
1504         ipgre_tunnel_bind_dev(dev);
1505
1506         dev->tstats = alloc_percpu(struct pcpu_tstats);
1507         if (!dev->tstats)
1508                 return -ENOMEM;
1509
1510         return 0;
1511 }
1512
/* Operations for Ethernet-over-GRE ("gretap") devices */
static const struct net_device_ops ipgre_tap_netdev_ops = {
	.ndo_init               = ipgre_tap_init,
	.ndo_uninit             = ipgre_tunnel_uninit,
	.ndo_start_xmit         = ipgre_tunnel_xmit,
	.ndo_set_mac_address    = eth_mac_addr,
	.ndo_validate_addr      = eth_validate_addr,
	.ndo_change_mtu         = ipgre_tunnel_change_mtu,
	.ndo_get_stats          = ipgre_get_stats,
};
1522
/*
 * netdev setup for "gretap" devices: start from a standard Ethernet
 * device, then override the ops and destructor.  The device is
 * confined to its namespace (NETIF_F_NETNS_LOCAL).
 */
static void ipgre_tap_setup(struct net_device *dev)
{

	ether_setup(dev);

	dev->netdev_ops         = &ipgre_tap_netdev_ops;
	dev->destructor         = ipgre_dev_free;

	dev->iflink             = 0;
	dev->features           |= NETIF_F_NETNS_LOCAL;
}
1534
/*
 * rtnl_link_ops->newlink for "gre"/"gretap": parse the attributes,
 * reject duplicate tunnels, bind to the lower device and register.
 * Caller holds RTNL.
 */
static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[],
			 struct nlattr *data[])
{
	struct ip_tunnel *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	int mtu;
	int err;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &nt->parms);

	/* One device per (addresses, key, link) tuple of this type */
	if (ipgre_tunnel_find(net, &nt->parms, dev->type))
		return -EEXIST;

	if (dev->type == ARPHRD_ETHER && !tb[IFLA_ADDRESS])
		random_ether_addr(dev->dev_addr);

	mtu = ipgre_tunnel_bind_dev(dev);
	if (!tb[IFLA_MTU])
		dev->mtu = mtu;	/* an explicit IFLA_MTU wins over the computed one */

	/* Can use a lockless transmit, unless we generate output sequences */
	if (!(nt->parms.o_flags & GRE_SEQ))
		dev->features |= NETIF_F_LLTX;

	err = register_netdevice(dev);
	if (err)
		goto out;

	/* The hash table entry owns a device reference */
	dev_hold(dev);
	ipgre_tunnel_link(ign, nt);

out:
	return err;
}
1571
/*
 * rtnl_link_ops->changelink: update an existing tunnel's parameters.
 * Rejects changes that would collide with another tunnel, and
 * re-hashes the device when its addresses or input key change.  Not
 * permitted on the fallback device.  Caller holds RTNL.
 */
static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[],
			    struct nlattr *data[])
{
	struct ip_tunnel *t, *nt;
	struct net *net = dev_net(dev);
	struct ipgre_net *ign = net_generic(net, ipgre_net_id);
	struct ip_tunnel_parm p;
	int mtu;

	if (dev == ign->fb_tunnel_dev)
		return -EINVAL;

	nt = netdev_priv(dev);
	ipgre_netlink_parms(data, &p);

	t = ipgre_tunnel_locate(net, &p, 0);

	if (t) {
		/* The new parameters already belong to a tunnel:
		 * only acceptable if that tunnel is this device. */
		if (t->dev != dev)
			return -EEXIST;
	} else {
		t = nt;

		if (dev->type != ARPHRD_ETHER) {
			unsigned int nflags = 0;

			if (ipv4_is_multicast(p.iph.daddr))
				nflags = IFF_BROADCAST;
			else if (p.iph.daddr)
				nflags = IFF_POINTOPOINT;

			/* Cannot flip between broadcast and p-t-p mode */
			if ((dev->flags ^ nflags) &
			    (IFF_POINTOPOINT | IFF_BROADCAST))
				return -EINVAL;
		}

		/* Addresses and i_key feed the receive hash: re-link */
		ipgre_tunnel_unlink(ign, t);
		t->parms.iph.saddr = p.iph.saddr;
		t->parms.iph.daddr = p.iph.daddr;
		t->parms.i_key = p.i_key;
		if (dev->type != ARPHRD_ETHER) {
			memcpy(dev->dev_addr, &p.iph.saddr, 4);
			memcpy(dev->broadcast, &p.iph.daddr, 4);
		}
		ipgre_tunnel_link(ign, t);
		netdev_state_change(dev);
	}

	/* These do not affect hashing and may change in place */
	t->parms.o_key = p.o_key;
	t->parms.iph.ttl = p.iph.ttl;
	t->parms.iph.tos = p.iph.tos;
	t->parms.iph.frag_off = p.iph.frag_off;

	if (t->parms.link != p.link) {
		t->parms.link = p.link;
		mtu = ipgre_tunnel_bind_dev(dev);
		if (!tb[IFLA_MTU])
			dev->mtu = mtu;
		netdev_state_change(dev);
	}

	return 0;
}
1635
1636 static size_t ipgre_get_size(const struct net_device *dev)
1637 {
1638         return
1639                 /* IFLA_GRE_LINK */
1640                 nla_total_size(4) +
1641                 /* IFLA_GRE_IFLAGS */
1642                 nla_total_size(2) +
1643                 /* IFLA_GRE_OFLAGS */
1644                 nla_total_size(2) +
1645                 /* IFLA_GRE_IKEY */
1646                 nla_total_size(4) +
1647                 /* IFLA_GRE_OKEY */
1648                 nla_total_size(4) +
1649                 /* IFLA_GRE_LOCAL */
1650                 nla_total_size(4) +
1651                 /* IFLA_GRE_REMOTE */
1652                 nla_total_size(4) +
1653                 /* IFLA_GRE_TTL */
1654                 nla_total_size(1) +
1655                 /* IFLA_GRE_TOS */
1656                 nla_total_size(1) +
1657                 /* IFLA_GRE_PMTUDISC */
1658                 nla_total_size(1) +
1659                 0;
1660 }
1661
/*
 * rtnl_link_ops->fill_info: dump the tunnel parameters as IFLA_GRE_*
 * attributes.  The NLA_PUT* macros branch to nla_put_failure when
 * the message runs out of room.
 */
static int ipgre_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	struct ip_tunnel *t = netdev_priv(dev);
	struct ip_tunnel_parm *p = &t->parms;

	NLA_PUT_U32(skb, IFLA_GRE_LINK, p->link);
	NLA_PUT_BE16(skb, IFLA_GRE_IFLAGS, p->i_flags);
	NLA_PUT_BE16(skb, IFLA_GRE_OFLAGS, p->o_flags);
	NLA_PUT_BE32(skb, IFLA_GRE_IKEY, p->i_key);
	NLA_PUT_BE32(skb, IFLA_GRE_OKEY, p->o_key);
	NLA_PUT_BE32(skb, IFLA_GRE_LOCAL, p->iph.saddr);
	NLA_PUT_BE32(skb, IFLA_GRE_REMOTE, p->iph.daddr);
	NLA_PUT_U8(skb, IFLA_GRE_TTL, p->iph.ttl);
	NLA_PUT_U8(skb, IFLA_GRE_TOS, p->iph.tos);
	/* PMTUDISC is reported as a boolean derived from the DF bit */
	NLA_PUT_U8(skb, IFLA_GRE_PMTUDISC, !!(p->iph.frag_off & htons(IP_DF)));

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
1683
/* Validation policy for IFLA_GRE_* attributes; the address attributes
 * must be exactly as long as the corresponding iphdr fields (4 bytes) */
static const struct nla_policy ipgre_policy[IFLA_GRE_MAX + 1] = {
	[IFLA_GRE_LINK]         = { .type = NLA_U32 },
	[IFLA_GRE_IFLAGS]       = { .type = NLA_U16 },
	[IFLA_GRE_OFLAGS]       = { .type = NLA_U16 },
	[IFLA_GRE_IKEY]         = { .type = NLA_U32 },
	[IFLA_GRE_OKEY]         = { .type = NLA_U32 },
	[IFLA_GRE_LOCAL]        = { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_GRE_REMOTE]       = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_GRE_TTL]          = { .type = NLA_U8 },
	[IFLA_GRE_TOS]          = { .type = NLA_U8 },
	[IFLA_GRE_PMTUDISC]     = { .type = NLA_U8 },
};
1696
/* rtnetlink ops for plain layer-3 GRE tunnels ("ip link add ... type gre"). */
static struct rtnl_link_ops ipgre_link_ops __read_mostly = {
	.kind		= "gre",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tunnel_setup,
	.validate	= ipgre_tunnel_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1709
/*
 * rtnetlink ops for Ethernet-over-GRE ("gretap") devices.  Shares the
 * policy, newlink/changelink and dump paths with plain GRE; only the
 * setup and validate hooks differ.
 */
static struct rtnl_link_ops ipgre_tap_ops __read_mostly = {
	.kind		= "gretap",
	.maxtype	= IFLA_GRE_MAX,
	.policy		= ipgre_policy,
	.priv_size	= sizeof(struct ip_tunnel),
	.setup		= ipgre_tap_setup,
	.validate	= ipgre_tap_validate,
	.newlink	= ipgre_newlink,
	.changelink	= ipgre_changelink,
	.get_size	= ipgre_get_size,
	.fill_info	= ipgre_fill_info,
};
1722
1723 /*
1724  *      And now the modules code and kernel interface.
1725  */
1726
1727 static int __init ipgre_init(void)
1728 {
1729         int err;
1730
1731         printk(KERN_INFO "GRE over IPv4 tunneling driver\n");
1732
1733         err = register_pernet_device(&ipgre_net_ops);
1734         if (err < 0)
1735                 return err;
1736
1737         err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
1738         if (err < 0) {
1739                 printk(KERN_INFO "ipgre init: can't add protocol\n");
1740                 goto add_proto_failed;
1741         }
1742
1743         err = rtnl_link_register(&ipgre_link_ops);
1744         if (err < 0)
1745                 goto rtnl_link_failed;
1746
1747         err = rtnl_link_register(&ipgre_tap_ops);
1748         if (err < 0)
1749                 goto tap_ops_failed;
1750
1751 out:
1752         return err;
1753
1754 tap_ops_failed:
1755         rtnl_link_unregister(&ipgre_link_ops);
1756 rtnl_link_failed:
1757         gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
1758 add_proto_failed:
1759         unregister_pernet_device(&ipgre_net_ops);
1760         goto out;
1761 }
1762
1763 static void __exit ipgre_fini(void)
1764 {
1765         rtnl_link_unregister(&ipgre_tap_ops);
1766         rtnl_link_unregister(&ipgre_link_ops);
1767         if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
1768                 printk(KERN_INFO "ipgre close: can't remove protocol\n");
1769         unregister_pernet_device(&ipgre_net_ops);
1770 }
1771
/* Module entry/exit hooks and metadata. */
module_init(ipgre_init);
module_exit(ipgre_fini);
MODULE_LICENSE("GPL");
/* Let "ip link add ... type gre|gretap" autoload this module. */
MODULE_ALIAS_RTNL_LINK("gre");
MODULE_ALIAS_RTNL_LINK("gretap");