]> bbs.cooldavid.org Git - net-next-2.6.git/blob - net/ipv6/ip6_output.c
IPv6: Complete IPV6_DONTFRAG support
[net-next-2.6.git] / net / ipv6 / ip6_output.c
1 /*
2  *      IPv6 output functions
3  *      Linux INET6 implementation
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      Based on linux/net/ipv4/ip_output.c
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *      Changes:
16  *      A.N.Kuznetsov   :       arithmetic in fragmentation.
17  *                              extension headers are implemented.
18  *                              route changes now work.
19  *                              ip6_forward does not confuse sniffers.
20  *                              etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *      Imran Patel     :       frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *                      :       add ip6_append_data and related functions
26  *                              for datagram xmit
27  */
28
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44
45 #include <net/sock.h>
46 #include <net/snmp.h>
47
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58
59 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63         int len;
64
65         len = skb->len - sizeof(struct ipv6hdr);
66         if (len > IPV6_MAXPLEN)
67                 len = 0;
68         ipv6_hdr(skb)->payload_len = htons(len);
69
70         return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
71                        dst_output);
72 }
73
/*
 * Send a locally generated IPv6 packet: run __ip6_local_out() and, when it
 * returns 1, continue with dst_output().
 *
 * Any other value returned by __ip6_local_out() is passed back to the caller
 * unchanged; otherwise the result of dst_output() is returned.
 */
int ip6_local_out(struct sk_buff *skb)
{
        int err = __ip6_local_out(skb);

        if (unlikely(err != 1))
                return err;

        return dst_output(skb);
}
EXPORT_SYMBOL_GPL(ip6_local_out);
85
86 static int ip6_output_finish(struct sk_buff *skb)
87 {
88         struct dst_entry *dst = skb_dst(skb);
89
90         if (dst->hh)
91                 return neigh_hh_output(dst->hh, skb);
92         else if (dst->neighbour)
93                 return dst->neighbour->output(skb);
94
95         IP6_INC_STATS_BH(dev_net(dst->dev),
96                          ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
97         kfree_skb(skb);
98         return -EINVAL;
99
100 }
101
102 /* dev_loopback_xmit for use with netfilter. */
103 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
104 {
105         skb_reset_mac_header(newskb);
106         __skb_pull(newskb, skb_network_offset(newskb));
107         newskb->pkt_type = PACKET_LOOPBACK;
108         newskb->ip_summed = CHECKSUM_UNNECESSARY;
109         WARN_ON(!skb_dst(newskb));
110
111         netif_rx_ni(newskb);
112         return 0;
113 }
114
115
116 static int ip6_output2(struct sk_buff *skb)
117 {
118         struct dst_entry *dst = skb_dst(skb);
119         struct net_device *dev = dst->dev;
120
121         skb->protocol = htons(ETH_P_IPV6);
122         skb->dev = dev;
123
124         if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
125                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
126
127                 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
128                     ((mroute6_socket(dev_net(dev)) &&
129                      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
130                      ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
131                                          &ipv6_hdr(skb)->saddr))) {
132                         struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
133
134                         /* Do not check for IFF_ALLMULTI; multicast routing
135                            is not supported in any case.
136                          */
137                         if (newskb)
138                                 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
139                                         NULL, newskb->dev,
140                                         ip6_dev_loopback_xmit);
141
142                         if (ipv6_hdr(skb)->hop_limit == 0) {
143                                 IP6_INC_STATS(dev_net(dev), idev,
144                                               IPSTATS_MIB_OUTDISCARDS);
145                                 kfree_skb(skb);
146                                 return 0;
147                         }
148                 }
149
150                 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
151                                 skb->len);
152         }
153
154         return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
155                        ip6_output_finish);
156 }
157
158 static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
159 {
160         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
161
162         return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
163                skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
164 }
165
166 int ip6_output(struct sk_buff *skb)
167 {
168         struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
169         if (unlikely(idev->cnf.disable_ipv6)) {
170                 IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev,
171                               IPSTATS_MIB_OUTDISCARDS);
172                 kfree_skb(skb);
173                 return 0;
174         }
175
176         if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
177                                 dst_allfrag(skb_dst(skb)))
178                 return ip6_fragment(skb, ip6_output2);
179         else
180                 return ip6_output2(skb);
181 }
182
183 /*
184  *      xmit an sk_buff (used by TCP, SCTP and DCCP)
185  */
186
187 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
188              struct ipv6_txoptions *opt)
189 {
190         struct net *net = sock_net(sk);
191         struct ipv6_pinfo *np = inet6_sk(sk);
192         struct in6_addr *first_hop = &fl->fl6_dst;
193         struct dst_entry *dst = skb_dst(skb);
194         struct ipv6hdr *hdr;
195         u8  proto = fl->proto;
196         int seg_len = skb->len;
197         int hlimit = -1;
198         int tclass = 0;
199         u32 mtu;
200
201         if (opt) {
202                 unsigned int head_room;
203
204                 /* First: exthdrs may take lots of space (~8K for now)
205                    MAX_HEADER is not enough.
206                  */
207                 head_room = opt->opt_nflen + opt->opt_flen;
208                 seg_len += head_room;
209                 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
210
211                 if (skb_headroom(skb) < head_room) {
212                         struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
213                         if (skb2 == NULL) {
214                                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
215                                               IPSTATS_MIB_OUTDISCARDS);
216                                 kfree_skb(skb);
217                                 return -ENOBUFS;
218                         }
219                         kfree_skb(skb);
220                         skb = skb2;
221                         if (sk)
222                                 skb_set_owner_w(skb, sk);
223                 }
224                 if (opt->opt_flen)
225                         ipv6_push_frag_opts(skb, opt, &proto);
226                 if (opt->opt_nflen)
227                         ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
228         }
229
230         skb_push(skb, sizeof(struct ipv6hdr));
231         skb_reset_network_header(skb);
232         hdr = ipv6_hdr(skb);
233
234         /*
235          *      Fill in the IPv6 header
236          */
237         if (np) {
238                 tclass = np->tclass;
239                 hlimit = np->hop_limit;
240         }
241         if (hlimit < 0)
242                 hlimit = ip6_dst_hoplimit(dst);
243
244         *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
245
246         hdr->payload_len = htons(seg_len);
247         hdr->nexthdr = proto;
248         hdr->hop_limit = hlimit;
249
250         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
251         ipv6_addr_copy(&hdr->daddr, first_hop);
252
253         skb->priority = sk->sk_priority;
254         skb->mark = sk->sk_mark;
255
256         mtu = dst_mtu(dst);
257         if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
258                 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
259                               IPSTATS_MIB_OUT, skb->len);
260                 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
261                                 dst_output);
262         }
263
264         if (net_ratelimit())
265                 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
266         skb->dev = dst->dev;
267         icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
268         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
269         kfree_skb(skb);
270         return -EMSGSIZE;
271 }
272
273 EXPORT_SYMBOL(ip6_xmit);
274
275 /*
276  *      To avoid extra problems, ND packets are sent through this
277  *      routine. It's code duplication, but I really want to avoid
278  *      extra checks since ipv6_build_header is used by TCP (which
279  *      is performance critical for us).
280  */
281
282 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
283                const struct in6_addr *saddr, const struct in6_addr *daddr,
284                int proto, int len)
285 {
286         struct ipv6_pinfo *np = inet6_sk(sk);
287         struct ipv6hdr *hdr;
288         int totlen;
289
290         skb->protocol = htons(ETH_P_IPV6);
291         skb->dev = dev;
292
293         totlen = len + sizeof(struct ipv6hdr);
294
295         skb_reset_network_header(skb);
296         skb_put(skb, sizeof(struct ipv6hdr));
297         hdr = ipv6_hdr(skb);
298
299         *(__be32*)hdr = htonl(0x60000000);
300
301         hdr->payload_len = htons(len);
302         hdr->nexthdr = proto;
303         hdr->hop_limit = np->hop_limit;
304
305         ipv6_addr_copy(&hdr->saddr, saddr);
306         ipv6_addr_copy(&hdr->daddr, daddr);
307
308         return 0;
309 }
310
311 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
312 {
313         struct ip6_ra_chain *ra;
314         struct sock *last = NULL;
315
316         read_lock(&ip6_ra_lock);
317         for (ra = ip6_ra_chain; ra; ra = ra->next) {
318                 struct sock *sk = ra->sk;
319                 if (sk && ra->sel == sel &&
320                     (!sk->sk_bound_dev_if ||
321                      sk->sk_bound_dev_if == skb->dev->ifindex)) {
322                         if (last) {
323                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
324                                 if (skb2)
325                                         rawv6_rcv(last, skb2);
326                         }
327                         last = sk;
328                 }
329         }
330
331         if (last) {
332                 rawv6_rcv(last, skb);
333                 read_unlock(&ip6_ra_lock);
334                 return 1;
335         }
336         read_unlock(&ip6_ra_lock);
337         return 0;
338 }
339
340 static int ip6_forward_proxy_check(struct sk_buff *skb)
341 {
342         struct ipv6hdr *hdr = ipv6_hdr(skb);
343         u8 nexthdr = hdr->nexthdr;
344         int offset;
345
346         if (ipv6_ext_hdr(nexthdr)) {
347                 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
348                 if (offset < 0)
349                         return 0;
350         } else
351                 offset = sizeof(struct ipv6hdr);
352
353         if (nexthdr == IPPROTO_ICMPV6) {
354                 struct icmp6hdr *icmp6;
355
356                 if (!pskb_may_pull(skb, (skb_network_header(skb) +
357                                          offset + 1 - skb->data)))
358                         return 0;
359
360                 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
361
362                 switch (icmp6->icmp6_type) {
363                 case NDISC_ROUTER_SOLICITATION:
364                 case NDISC_ROUTER_ADVERTISEMENT:
365                 case NDISC_NEIGHBOUR_SOLICITATION:
366                 case NDISC_NEIGHBOUR_ADVERTISEMENT:
367                 case NDISC_REDIRECT:
368                         /* For reaction involving unicast neighbor discovery
369                          * message destined to the proxied address, pass it to
370                          * input function.
371                          */
372                         return 1;
373                 default:
374                         break;
375                 }
376         }
377
378         /*
379          * The proxying router can't forward traffic sent to a link-local
380          * address, so signal the sender and discard the packet. This
381          * behavior is clarified by the MIPv6 specification.
382          */
383         if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
384                 dst_link_failure(skb);
385                 return -1;
386         }
387
388         return 0;
389 }
390
/*
 * Continuation run after the NF_INET_FORWARD hook in ip6_forward(): pass the
 * packet to dst_output() and return its result.
 */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
        int rc = dst_output(skb);

        return rc;
}
395
/*
 * ip6_forward - forward an IPv6 packet that is not addressed to this host.
 *
 * Processing order, as implemented below:
 *   1. Refuse when forwarding is off (devconf_all->forwarding == 0); counted
 *      as INADDRERRORS via the "error" label.
 *   2. Drop skbs flagged by skb_warn_if_lro() and packets rejected by the
 *      XFRM forward policy check.
 *   3. Packets carrying a Router Alert option are offered to
 *      ip6_call_ra_chain(); if a socket took the packet, return 0.
 *   4. hop_limit <= 1: send ICMPv6 Time Exceeded, free, return -ETIMEDOUT.
 *   5. With proxy_ndp enabled and the destination found by pneigh_lookup(),
 *      ip6_forward_proxy_check() decides: > 0 hands the packet to
 *      ip6_input(), < 0 drops it.
 *   6. xfrm6_route_forward(); the dst is re-read afterwards.
 *   7. If the packet would leave through the device it arrived on (and is
 *      neither source routed nor carrying a sec path), a rate-limited
 *      redirect is sent.  Otherwise packets with an unspecified, multicast,
 *      loopback or link-local source address are rejected.
 *   8. Packets longer than the route MTU (raised to at least IPV6_MIN_MTU)
 *      trigger ICMPv6 Packet Too Big and return -EMSGSIZE.
 *   9. skb_cow() for the link-layer header, decrement the hop limit, and pass
 *      the packet through the NF_INET_FORWARD hook to ip6_forward_finish().
 *
 * Returns 0 when a Router Alert socket took the packet, the result of
 * ip6_input() or NF_HOOK() where those are called, -ETIMEDOUT, -EMSGSIZE, or
 * -EINVAL on the "error"/"drop" paths.  Every path that returns an error from
 * this function itself frees the skb with kfree_skb().
 */
396 int ip6_forward(struct sk_buff *skb)
397 {
398         struct dst_entry *dst = skb_dst(skb);
399         struct ipv6hdr *hdr = ipv6_hdr(skb);
400         struct inet6_skb_parm *opt = IP6CB(skb);
401         struct net *net = dev_net(dst->dev);
402         u32 mtu;
403
404         if (net->ipv6.devconf_all->forwarding == 0)
405                 goto error;
406
407         if (skb_warn_if_lro(skb))
408                 goto drop;
409
410         if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
411                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
412                 goto drop;
413         }
414
415         skb_forward_csum(skb);
416
417         /*
418          *      We DO NOT make any processing on
419          *      RA packets, pushing them to user level AS IS
420          *      without any WARRANTY that application will be able
421          *      to interpret them. The reason is that we
422          *      cannot make anything clever here.
423          *
424          *      We are not end-node, so that if packet contains
425          *      AH/ESP, we cannot make anything.
426          *      Defragmentation also would be mistake, RA packets
427          *      cannot be fragmented, because there is no warranty
428          *      that different fragments will go along one path. --ANK
429          */
430         if (opt->ra) {
431                 u8 *ptr = skb_network_header(skb) + opt->ra;
432                 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
433                         return 0;
434         }
435
436         /*
437          *      check and decrement ttl
438          */
439         if (hdr->hop_limit <= 1) {
440                 /* Force OUTPUT device used as source address */
441                 skb->dev = dst->dev;
442                 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
443                 IP6_INC_STATS_BH(net,
444                                  ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
445
446                 kfree_skb(skb);
447                 return -ETIMEDOUT;
448         }
449
450         /* XXX: idev->cnf.proxy_ndp? */
451         if (net->ipv6.devconf_all->proxy_ndp &&
452             pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
453                 int proxied = ip6_forward_proxy_check(skb);
454                 if (proxied > 0)
455                         return ip6_input(skb);
456                 else if (proxied < 0) {
457                         IP6_INC_STATS(net, ip6_dst_idev(dst),
458                                       IPSTATS_MIB_INDISCARDS);
459                         goto drop;
460                 }
461         }
462
463         if (!xfrm6_route_forward(skb)) {
464                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
465                 goto drop;
466         }
467         dst = skb_dst(skb);
468
469         /* IPv6 specs say nothing about it, but it is clear that we cannot
470            send redirects to source routed frames.
471            We don't send redirects to frames decapsulated from IPsec.
472          */
473         if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
474             !skb_sec_path(skb)) {
475                 struct in6_addr *target = NULL;
476                 struct rt6_info *rt;
477                 struct neighbour *n = dst->neighbour;
478
479                 /*
480                  *      incoming and outgoing devices are the same
481                  *      send a redirect.
482                  */
483
484                 rt = (struct rt6_info *) dst;
485                 if ((rt->rt6i_flags & RTF_GATEWAY))
486                         target = (struct in6_addr*)&n->primary_key;
487                 else
488                         target = &hdr->daddr;
489
490                 /* Limit redirects both by destination (here)
491                    and by source (inside ndisc_send_redirect)
492                  */
493                 if (xrlim_allow(dst, 1*HZ))
494                         ndisc_send_redirect(skb, n, target);
495         } else {
496                 int addrtype = ipv6_addr_type(&hdr->saddr);
497
498                 /* This check is security critical. */
499                 if (addrtype == IPV6_ADDR_ANY ||
500                     addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
501                         goto error;
502                 if (addrtype & IPV6_ADDR_LINKLOCAL) {
503                         icmpv6_send(skb, ICMPV6_DEST_UNREACH,
504                                     ICMPV6_NOT_NEIGHBOUR, 0);
505                         goto error;
506                 }
507         }
508
509         mtu = dst_mtu(dst);
510         if (mtu < IPV6_MIN_MTU)
511                 mtu = IPV6_MIN_MTU;
512
513         if (skb->len > mtu) {
514                 /* Again, force OUTPUT device used as source address */
515                 skb->dev = dst->dev;
516                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
517                 IP6_INC_STATS_BH(net,
518                                  ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
519                 IP6_INC_STATS_BH(net,
520                                  ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
521                 kfree_skb(skb);
522                 return -EMSGSIZE;
523         }
524
525         if (skb_cow(skb, dst->dev->hard_header_len)) {
526                 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
527                 goto drop;
528         }
529
530         hdr = ipv6_hdr(skb);
531
532         /* Mangling hops number delayed to point after skb COW */
533
534         hdr->hop_limit--;
535
536         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
537         return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
538                        ip6_forward_finish);
539
540 error:
541         IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
542 drop:
543         kfree_skb(skb);
544         return -EINVAL;
545 }
546
547 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
548 {
549         to->pkt_type = from->pkt_type;
550         to->priority = from->priority;
551         to->protocol = from->protocol;
552         skb_dst_drop(to);
553         skb_dst_set(to, dst_clone(skb_dst(from)));
554         to->dev = from->dev;
555         to->mark = from->mark;
556
557 #ifdef CONFIG_NET_SCHED
558         to->tc_index = from->tc_index;
559 #endif
560         nf_copy(to, from);
561 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
562     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
563         to->nf_trace = from->nf_trace;
564 #endif
565         skb_copy_secmark(to, from);
566 }
567
568 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
569 {
570         u16 offset = sizeof(struct ipv6hdr);
571         struct ipv6_opt_hdr *exthdr =
572                                 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
573         unsigned int packet_len = skb->tail - skb->network_header;
574         int found_rhdr = 0;
575         *nexthdr = &ipv6_hdr(skb)->nexthdr;
576
577         while (offset + 1 <= packet_len) {
578
579                 switch (**nexthdr) {
580
581                 case NEXTHDR_HOP:
582                         break;
583                 case NEXTHDR_ROUTING:
584                         found_rhdr = 1;
585                         break;
586                 case NEXTHDR_DEST:
587 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
588                         if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
589                                 break;
590 #endif
591                         if (found_rhdr)
592                                 return offset;
593                         break;
594                 default :
595                         return offset;
596                 }
597
598                 offset += ipv6_optlen(exthdr);
599                 *nexthdr = &exthdr->nexthdr;
600                 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
601                                                  offset);
602         }
603
604         return offset;
605 }
606
607 static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
608 {
609         struct sk_buff *frag;
610         struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
611         struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
612         struct ipv6hdr *tmp_hdr;
613         struct frag_hdr *fh;
614         unsigned int mtu, hlen, left, len;
615         __be32 frag_id = 0;
616         int ptr, offset = 0, err=0;
617         u8 *prevhdr, nexthdr = 0;
618         struct net *net = dev_net(skb_dst(skb)->dev);
619
620         hlen = ip6_find_1stfragopt(skb, &prevhdr);
621         nexthdr = *prevhdr;
622
623         mtu = ip6_skb_dst_mtu(skb);
624
625         /* We must not fragment if the socket is set to force MTU discovery
626          * or if the skb it not generated by a local socket.
627          */
628         if (!skb->local_df) {
629                 skb->dev = skb_dst(skb)->dev;
630                 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
631                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
632                               IPSTATS_MIB_FRAGFAILS);
633                 kfree_skb(skb);
634                 return -EMSGSIZE;
635         }
636
637         if (np && np->frag_size < mtu) {
638                 if (np->frag_size)
639                         mtu = np->frag_size;
640         }
641         mtu -= hlen + sizeof(struct frag_hdr);
642
643         if (skb_has_frags(skb)) {
644                 int first_len = skb_pagelen(skb);
645                 int truesizes = 0;
646
647                 if (first_len - hlen > mtu ||
648                     ((first_len - hlen) & 7) ||
649                     skb_cloned(skb))
650                         goto slow_path;
651
652                 skb_walk_frags(skb, frag) {
653                         /* Correct geometry. */
654                         if (frag->len > mtu ||
655                             ((frag->len & 7) && frag->next) ||
656                             skb_headroom(frag) < hlen)
657                             goto slow_path;
658
659                         /* Partially cloned skb? */
660                         if (skb_shared(frag))
661                                 goto slow_path;
662
663                         BUG_ON(frag->sk);
664                         if (skb->sk) {
665                                 frag->sk = skb->sk;
666                                 frag->destructor = sock_wfree;
667                                 truesizes += frag->truesize;
668                         }
669                 }
670
671                 err = 0;
672                 offset = 0;
673                 frag = skb_shinfo(skb)->frag_list;
674                 skb_frag_list_init(skb);
675                 /* BUILD HEADER */
676
677                 *prevhdr = NEXTHDR_FRAGMENT;
678                 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
679                 if (!tmp_hdr) {
680                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
681                                       IPSTATS_MIB_FRAGFAILS);
682                         return -ENOMEM;
683                 }
684
685                 __skb_pull(skb, hlen);
686                 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
687                 __skb_push(skb, hlen);
688                 skb_reset_network_header(skb);
689                 memcpy(skb_network_header(skb), tmp_hdr, hlen);
690
691                 ipv6_select_ident(fh);
692                 fh->nexthdr = nexthdr;
693                 fh->reserved = 0;
694                 fh->frag_off = htons(IP6_MF);
695                 frag_id = fh->identification;
696
697                 first_len = skb_pagelen(skb);
698                 skb->data_len = first_len - skb_headlen(skb);
699                 skb->truesize -= truesizes;
700                 skb->len = first_len;
701                 ipv6_hdr(skb)->payload_len = htons(first_len -
702                                                    sizeof(struct ipv6hdr));
703
704                 dst_hold(&rt->u.dst);
705
706                 for (;;) {
707                         /* Prepare header of the next frame,
708                          * before previous one went down. */
709                         if (frag) {
710                                 frag->ip_summed = CHECKSUM_NONE;
711                                 skb_reset_transport_header(frag);
712                                 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
713                                 __skb_push(frag, hlen);
714                                 skb_reset_network_header(frag);
715                                 memcpy(skb_network_header(frag), tmp_hdr,
716                                        hlen);
717                                 offset += skb->len - hlen - sizeof(struct frag_hdr);
718                                 fh->nexthdr = nexthdr;
719                                 fh->reserved = 0;
720                                 fh->frag_off = htons(offset);
721                                 if (frag->next != NULL)
722                                         fh->frag_off |= htons(IP6_MF);
723                                 fh->identification = frag_id;
724                                 ipv6_hdr(frag)->payload_len =
725                                                 htons(frag->len -
726                                                       sizeof(struct ipv6hdr));
727                                 ip6_copy_metadata(frag, skb);
728                         }
729
730                         err = output(skb);
731                         if(!err)
732                                 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
733                                               IPSTATS_MIB_FRAGCREATES);
734
735                         if (err || !frag)
736                                 break;
737
738                         skb = frag;
739                         frag = skb->next;
740                         skb->next = NULL;
741                 }
742
743                 kfree(tmp_hdr);
744
745                 if (err == 0) {
746                         IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
747                                       IPSTATS_MIB_FRAGOKS);
748                         dst_release(&rt->u.dst);
749                         return 0;
750                 }
751
752                 while (frag) {
753                         skb = frag->next;
754                         kfree_skb(frag);
755                         frag = skb;
756                 }
757
758                 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
759                               IPSTATS_MIB_FRAGFAILS);
760                 dst_release(&rt->u.dst);
761                 return err;
762         }
763
764 slow_path:
765         left = skb->len - hlen;         /* Space per frame */
766         ptr = hlen;                     /* Where to start from */
767
768         /*
769          *      Fragment the datagram.
770          */
771
772         *prevhdr = NEXTHDR_FRAGMENT;
773
774         /*
775          *      Keep copying data until we run out.
776          */
777         while(left > 0) {
778                 len = left;
779                 /* IF: it doesn't fit, use 'mtu' - the data space left */
780                 if (len > mtu)
781                         len = mtu;
782                 /* IF: we are not sending upto and including the packet end
783                    then align the next start on an eight byte boundary */
784                 if (len < left) {
785                         len &= ~7;
786                 }
787                 /*
788                  *      Allocate buffer.
789                  */
790
791                 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
792                         NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
793                         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
794                                       IPSTATS_MIB_FRAGFAILS);
795                         err = -ENOMEM;
796                         goto fail;
797                 }
798
799                 /*
800                  *      Set up data on packet
801                  */
802
803                 ip6_copy_metadata(frag, skb);
804                 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
805                 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
806                 skb_reset_network_header(frag);
807                 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
808                 frag->transport_header = (frag->network_header + hlen +
809                                           sizeof(struct frag_hdr));
810
811                 /*
812                  *      Charge the memory for the fragment to any owner
813                  *      it might possess
814                  */
815                 if (skb->sk)
816                         skb_set_owner_w(frag, skb->sk);
817
818                 /*
819                  *      Copy the packet header into the new buffer.
820                  */
821                 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
822
823                 /*
824                  *      Build fragment header.
825                  */
826                 fh->nexthdr = nexthdr;
827                 fh->reserved = 0;
828                 if (!frag_id) {
829                         ipv6_select_ident(fh);
830                         frag_id = fh->identification;
831                 } else
832                         fh->identification = frag_id;
833
834                 /*
835                  *      Copy a block of the IP datagram.
836                  */
837                 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
838                         BUG();
839                 left -= len;
840
841                 fh->frag_off = htons(offset);
842                 if (left > 0)
843                         fh->frag_off |= htons(IP6_MF);
844                 ipv6_hdr(frag)->payload_len = htons(frag->len -
845                                                     sizeof(struct ipv6hdr));
846
847                 ptr += len;
848                 offset += len;
849
850                 /*
851                  *      Put this fragment into the sending queue.
852                  */
853                 err = output(frag);
854                 if (err)
855                         goto fail;
856
857                 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
858                               IPSTATS_MIB_FRAGCREATES);
859         }
860         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
861                       IPSTATS_MIB_FRAGOKS);
862         kfree_skb(skb);
863         return err;
864
865 fail:
866         IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
867                       IPSTATS_MIB_FRAGFAILS);
868         kfree_skb(skb);
869         return err;
870 }
871
872 static inline int ip6_rt_check(struct rt6key *rt_key,
873                                struct in6_addr *fl_addr,
874                                struct in6_addr *addr_cache)
875 {
876         return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
877                 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
878 }
879
880 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
881                                           struct dst_entry *dst,
882                                           struct flowi *fl)
883 {
884         struct ipv6_pinfo *np = inet6_sk(sk);
885         struct rt6_info *rt = (struct rt6_info *)dst;
886
887         if (!dst)
888                 goto out;
889
890         /* Yes, checking route validity in not connected
891          * case is not very simple. Take into account,
892          * that we do not support routing by source, TOS,
893          * and MSG_DONTROUTE            --ANK (980726)
894          *
895          * 1. ip6_rt_check(): If route was host route,
896          *    check that cached destination is current.
897          *    If it is network route, we still may
898          *    check its validity using saved pointer
899          *    to the last used address: daddr_cache.
900          *    We do not want to save whole address now,
901          *    (because main consumer of this service
902          *    is tcp, which has not this problem),
903          *    so that the last trick works only on connected
904          *    sockets.
905          * 2. oif also should be the same.
906          */
907         if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
908 #ifdef CONFIG_IPV6_SUBTREES
909             ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
910 #endif
911             (fl->oif && fl->oif != dst->dev->ifindex)) {
912                 dst_release(dst);
913                 dst = NULL;
914         }
915
916 out:
917         return dst;
918 }
919
920 static int ip6_dst_lookup_tail(struct sock *sk,
921                                struct dst_entry **dst, struct flowi *fl)
922 {
923         int err;
924         struct net *net = sock_net(sk);
925
926         if (*dst == NULL)
927                 *dst = ip6_route_output(net, sk, fl);
928
929         if ((err = (*dst)->error))
930                 goto out_err_release;
931
932         if (ipv6_addr_any(&fl->fl6_src)) {
933                 err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
934                                          &fl->fl6_dst,
935                                          sk ? inet6_sk(sk)->srcprefs : 0,
936                                          &fl->fl6_src);
937                 if (err)
938                         goto out_err_release;
939         }
940
941 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
942         /*
943          * Here if the dst entry we've looked up
944          * has a neighbour entry that is in the INCOMPLETE
945          * state and the src address from the flow is
946          * marked as OPTIMISTIC, we release the found
947          * dst entry and replace it instead with the
948          * dst entry of the nexthop router
949          */
950         if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
951                 struct inet6_ifaddr *ifp;
952                 struct flowi fl_gw;
953                 int redirect;
954
955                 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
956                                       (*dst)->dev, 1);
957
958                 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
959                 if (ifp)
960                         in6_ifa_put(ifp);
961
962                 if (redirect) {
963                         /*
964                          * We need to get the dst entry for the
965                          * default router instead
966                          */
967                         dst_release(*dst);
968                         memcpy(&fl_gw, fl, sizeof(struct flowi));
969                         memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
970                         *dst = ip6_route_output(net, sk, &fl_gw);
971                         if ((err = (*dst)->error))
972                                 goto out_err_release;
973                 }
974         }
975 #endif
976
977         return 0;
978
979 out_err_release:
980         if (err == -ENETUNREACH)
981                 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
982         dst_release(*dst);
983         *dst = NULL;
984         return err;
985 }
986
/**
 *      ip6_dst_lookup - route lookup for a flow, ignoring any cached route
 *      @sk: socket supplying route information
 *      @dst: where the resulting dst_entry pointer is stored
 *      @fl: flow to look up
 *
 *      The result slot is cleared first, so a fresh lookup is always
 *      performed by ip6_dst_lookup_tail().
 *
 *      Returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
        /* Start with no candidate route so the tail does a full lookup. */
        *dst = NULL;

        return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1003
1004 /**
1005  *      ip6_sk_dst_lookup - perform socket cached route lookup on flow
1006  *      @sk: socket which provides the dst cache and route info
1007  *      @dst: pointer to dst_entry * for result
1008  *      @fl: flow to lookup
1009  *
1010  *      This function performs a route lookup on the given flow with the
1011  *      possibility of using the cached route in the socket if it is valid.
1012  *      It will take the socket dst lock when operating on the dst cache.
1013  *      As a result, this function can only be used in process context.
1014  *
1015  *      It returns zero on success, or a standard errno code on error.
1016  */
1017 int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1018 {
1019         *dst = NULL;
1020         if (sk) {
1021                 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1022                 *dst = ip6_sk_dst_check(sk, *dst, fl);
1023         }
1024
1025         return ip6_dst_lookup_tail(sk, dst, fl);
1026 }
1027 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1028
1029 static inline int ip6_ufo_append_data(struct sock *sk,
1030                         int getfrag(void *from, char *to, int offset, int len,
1031                         int odd, struct sk_buff *skb),
1032                         void *from, int length, int hh_len, int fragheaderlen,
1033                         int transhdrlen, int mtu,unsigned int flags)
1034
1035 {
1036         struct sk_buff *skb;
1037         int err;
1038
1039         /* There is support for UDP large send offload by network
1040          * device, so create one single skb packet containing complete
1041          * udp datagram
1042          */
1043         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1044                 skb = sock_alloc_send_skb(sk,
1045                         hh_len + fragheaderlen + transhdrlen + 20,
1046                         (flags & MSG_DONTWAIT), &err);
1047                 if (skb == NULL)
1048                         return -ENOMEM;
1049
1050                 /* reserve space for Hardware header */
1051                 skb_reserve(skb, hh_len);
1052
1053                 /* create space for UDP/IP header */
1054                 skb_put(skb,fragheaderlen + transhdrlen);
1055
1056                 /* initialize network header pointer */
1057                 skb_reset_network_header(skb);
1058
1059                 /* initialize protocol header pointer */
1060                 skb->transport_header = skb->network_header + fragheaderlen;
1061
1062                 skb->ip_summed = CHECKSUM_PARTIAL;
1063                 skb->csum = 0;
1064                 sk->sk_sndmsg_off = 0;
1065         }
1066
1067         err = skb_append_datato_frags(sk,skb, getfrag, from,
1068                                       (length - transhdrlen));
1069         if (!err) {
1070                 struct frag_hdr fhdr;
1071
1072                 /* Specify the length of each IPv6 datagram fragment.
1073                  * It has to be a multiple of 8.
1074                  */
1075                 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1076                                              sizeof(struct frag_hdr)) & ~7;
1077                 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1078                 ipv6_select_ident(&fhdr);
1079                 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1080                 __skb_queue_tail(&sk->sk_write_queue, skb);
1081
1082                 return 0;
1083         }
1084         /* There is not enough support do UPD LSO,
1085          * so follow normal path
1086          */
1087         kfree_skb(skb);
1088
1089         return err;
1090 }
1091
1092 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1093                                                gfp_t gfp)
1094 {
1095         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1096 }
1097
1098 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1099                                                 gfp_t gfp)
1100 {
1101         return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1102 }
1103
1104 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1105         int offset, int len, int odd, struct sk_buff *skb),
1106         void *from, int length, int transhdrlen,
1107         int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1108         struct rt6_info *rt, unsigned int flags, int dontfrag)
1109 {
1110         struct inet_sock *inet = inet_sk(sk);
1111         struct ipv6_pinfo *np = inet6_sk(sk);
1112         struct sk_buff *skb;
1113         unsigned int maxfraglen, fragheaderlen;
1114         int exthdrlen;
1115         int hh_len;
1116         int mtu;
1117         int copy;
1118         int err;
1119         int offset = 0;
1120         int csummode = CHECKSUM_NONE;
1121
1122         if (flags&MSG_PROBE)
1123                 return 0;
1124         if (skb_queue_empty(&sk->sk_write_queue)) {
1125                 /*
1126                  * setup for corking
1127                  */
1128                 if (opt) {
1129                         if (WARN_ON(np->cork.opt))
1130                                 return -EINVAL;
1131
1132                         np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1133                         if (unlikely(np->cork.opt == NULL))
1134                                 return -ENOBUFS;
1135
1136                         np->cork.opt->tot_len = opt->tot_len;
1137                         np->cork.opt->opt_flen = opt->opt_flen;
1138                         np->cork.opt->opt_nflen = opt->opt_nflen;
1139
1140                         np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1141                                                             sk->sk_allocation);
1142                         if (opt->dst0opt && !np->cork.opt->dst0opt)
1143                                 return -ENOBUFS;
1144
1145                         np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1146                                                             sk->sk_allocation);
1147                         if (opt->dst1opt && !np->cork.opt->dst1opt)
1148                                 return -ENOBUFS;
1149
1150                         np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1151                                                            sk->sk_allocation);
1152                         if (opt->hopopt && !np->cork.opt->hopopt)
1153                                 return -ENOBUFS;
1154
1155                         np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1156                                                             sk->sk_allocation);
1157                         if (opt->srcrt && !np->cork.opt->srcrt)
1158                                 return -ENOBUFS;
1159
1160                         /* need source address above miyazawa*/
1161                 }
1162                 dst_hold(&rt->u.dst);
1163                 inet->cork.dst = &rt->u.dst;
1164                 inet->cork.fl = *fl;
1165                 np->cork.hop_limit = hlimit;
1166                 np->cork.tclass = tclass;
1167                 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1168                       rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
1169                 if (np->frag_size < mtu) {
1170                         if (np->frag_size)
1171                                 mtu = np->frag_size;
1172                 }
1173                 inet->cork.fragsize = mtu;
1174                 if (dst_allfrag(rt->u.dst.path))
1175                         inet->cork.flags |= IPCORK_ALLFRAG;
1176                 inet->cork.length = 0;
1177                 sk->sk_sndmsg_page = NULL;
1178                 sk->sk_sndmsg_off = 0;
1179                 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
1180                             rt->rt6i_nfheader_len;
1181                 length += exthdrlen;
1182                 transhdrlen += exthdrlen;
1183         } else {
1184                 rt = (struct rt6_info *)inet->cork.dst;
1185                 fl = &inet->cork.fl;
1186                 opt = np->cork.opt;
1187                 transhdrlen = 0;
1188                 exthdrlen = 0;
1189                 mtu = inet->cork.fragsize;
1190         }
1191
1192         hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1193
1194         fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1195                         (opt ? opt->opt_nflen : 0);
1196         maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1197
1198         if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1199                 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1200                         ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1201                         return -EMSGSIZE;
1202                 }
1203         }
1204
1205         /*
1206          * Let's try using as much space as possible.
1207          * Use MTU if total length of the message fits into the MTU.
1208          * Otherwise, we need to reserve fragment header and
1209          * fragment alignment (= 8-15 octects, in total).
1210          *
1211          * Note that we may need to "move" the data from the tail of
1212          * of the buffer to the new fragment when we split
1213          * the message.
1214          *
1215          * FIXME: It may be fragmented into multiple chunks
1216          *        at once if non-fragmentable extension headers
1217          *        are too large.
1218          * --yoshfuji
1219          */
1220
1221         inet->cork.length += length;
1222         if (length > mtu) {
1223                 int proto = sk->sk_protocol;
1224                 if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)){
1225                         ipv6_local_rxpmtu(sk, fl, mtu-exthdrlen);
1226                         return -EMSGSIZE;
1227                 }
1228
1229                 if (proto == IPPROTO_UDP &&
1230                     (rt->u.dst.dev->features & NETIF_F_UFO)) {
1231
1232                         err = ip6_ufo_append_data(sk, getfrag, from, length,
1233                                                   hh_len, fragheaderlen,
1234                                                   transhdrlen, mtu, flags);
1235                         if (err)
1236                                 goto error;
1237                         return 0;
1238                 }
1239         }
1240
1241         if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1242                 goto alloc_new_skb;
1243
1244         while (length > 0) {
1245                 /* Check if the remaining data fits into current packet. */
1246                 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1247                 if (copy < length)
1248                         copy = maxfraglen - skb->len;
1249
1250                 if (copy <= 0) {
1251                         char *data;
1252                         unsigned int datalen;
1253                         unsigned int fraglen;
1254                         unsigned int fraggap;
1255                         unsigned int alloclen;
1256                         struct sk_buff *skb_prev;
1257 alloc_new_skb:
1258                         skb_prev = skb;
1259
1260                         /* There's no room in the current skb */
1261                         if (skb_prev)
1262                                 fraggap = skb_prev->len - maxfraglen;
1263                         else
1264                                 fraggap = 0;
1265
1266                         /*
1267                          * If remaining data exceeds the mtu,
1268                          * we know we need more fragment(s).
1269                          */
1270                         datalen = length + fraggap;
1271                         if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1272                                 datalen = maxfraglen - fragheaderlen;
1273
1274                         fraglen = datalen + fragheaderlen;
1275                         if ((flags & MSG_MORE) &&
1276                             !(rt->u.dst.dev->features&NETIF_F_SG))
1277                                 alloclen = mtu;
1278                         else
1279                                 alloclen = datalen + fragheaderlen;
1280
1281                         /*
1282                          * The last fragment gets additional space at tail.
1283                          * Note: we overallocate on fragments with MSG_MODE
1284                          * because we have no idea if we're the last one.
1285                          */
1286                         if (datalen == length + fraggap)
1287                                 alloclen += rt->u.dst.trailer_len;
1288
1289                         /*
1290                          * We just reserve space for fragment header.
1291                          * Note: this may be overallocation if the message
1292                          * (without MSG_MORE) fits into the MTU.
1293                          */
1294                         alloclen += sizeof(struct frag_hdr);
1295
1296                         if (transhdrlen) {
1297                                 skb = sock_alloc_send_skb(sk,
1298                                                 alloclen + hh_len,
1299                                                 (flags & MSG_DONTWAIT), &err);
1300                         } else {
1301                                 skb = NULL;
1302                                 if (atomic_read(&sk->sk_wmem_alloc) <=
1303                                     2 * sk->sk_sndbuf)
1304                                         skb = sock_wmalloc(sk,
1305                                                            alloclen + hh_len, 1,
1306                                                            sk->sk_allocation);
1307                                 if (unlikely(skb == NULL))
1308                                         err = -ENOBUFS;
1309                         }
1310                         if (skb == NULL)
1311                                 goto error;
1312                         /*
1313                          *      Fill in the control structures
1314                          */
1315                         skb->ip_summed = csummode;
1316                         skb->csum = 0;
1317                         /* reserve for fragmentation */
1318                         skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1319
1320                         /*
1321                          *      Find where to start putting bytes
1322                          */
1323                         data = skb_put(skb, fraglen);
1324                         skb_set_network_header(skb, exthdrlen);
1325                         data += fragheaderlen;
1326                         skb->transport_header = (skb->network_header +
1327                                                  fragheaderlen);
1328                         if (fraggap) {
1329                                 skb->csum = skb_copy_and_csum_bits(
1330                                         skb_prev, maxfraglen,
1331                                         data + transhdrlen, fraggap, 0);
1332                                 skb_prev->csum = csum_sub(skb_prev->csum,
1333                                                           skb->csum);
1334                                 data += fraggap;
1335                                 pskb_trim_unique(skb_prev, maxfraglen);
1336                         }
1337                         copy = datalen - transhdrlen - fraggap;
1338                         if (copy < 0) {
1339                                 err = -EINVAL;
1340                                 kfree_skb(skb);
1341                                 goto error;
1342                         } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1343                                 err = -EFAULT;
1344                                 kfree_skb(skb);
1345                                 goto error;
1346                         }
1347
1348                         offset += copy;
1349                         length -= datalen - fraggap;
1350                         transhdrlen = 0;
1351                         exthdrlen = 0;
1352                         csummode = CHECKSUM_NONE;
1353
1354                         /*
1355                          * Put the packet on the pending queue
1356                          */
1357                         __skb_queue_tail(&sk->sk_write_queue, skb);
1358                         continue;
1359                 }
1360
1361                 if (copy > length)
1362                         copy = length;
1363
1364                 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1365                         unsigned int off;
1366
1367                         off = skb->len;
1368                         if (getfrag(from, skb_put(skb, copy),
1369                                                 offset, copy, off, skb) < 0) {
1370                                 __skb_trim(skb, off);
1371                                 err = -EFAULT;
1372                                 goto error;
1373                         }
1374                 } else {
1375                         int i = skb_shinfo(skb)->nr_frags;
1376                         skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1377                         struct page *page = sk->sk_sndmsg_page;
1378                         int off = sk->sk_sndmsg_off;
1379                         unsigned int left;
1380
1381                         if (page && (left = PAGE_SIZE - off) > 0) {
1382                                 if (copy >= left)
1383                                         copy = left;
1384                                 if (page != frag->page) {
1385                                         if (i == MAX_SKB_FRAGS) {
1386                                                 err = -EMSGSIZE;
1387                                                 goto error;
1388                                         }
1389                                         get_page(page);
1390                                         skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1391                                         frag = &skb_shinfo(skb)->frags[i];
1392                                 }
1393                         } else if(i < MAX_SKB_FRAGS) {
1394                                 if (copy > PAGE_SIZE)
1395                                         copy = PAGE_SIZE;
1396                                 page = alloc_pages(sk->sk_allocation, 0);
1397                                 if (page == NULL) {
1398                                         err = -ENOMEM;
1399                                         goto error;
1400                                 }
1401                                 sk->sk_sndmsg_page = page;
1402                                 sk->sk_sndmsg_off = 0;
1403
1404                                 skb_fill_page_desc(skb, i, page, 0, 0);
1405                                 frag = &skb_shinfo(skb)->frags[i];
1406                         } else {
1407                                 err = -EMSGSIZE;
1408                                 goto error;
1409                         }
1410                         if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1411                                 err = -EFAULT;
1412                                 goto error;
1413                         }
1414                         sk->sk_sndmsg_off += copy;
1415                         frag->size += copy;
1416                         skb->len += copy;
1417                         skb->data_len += copy;
1418                         skb->truesize += copy;
1419                         atomic_add(copy, &sk->sk_wmem_alloc);
1420                 }
1421                 offset += copy;
1422                 length -= copy;
1423         }
1424         return 0;
1425 error:
1426         inet->cork.length -= length;
1427         IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1428         return err;
1429 }
1430
1431 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1432 {
1433         if (np->cork.opt) {
1434                 kfree(np->cork.opt->dst0opt);
1435                 kfree(np->cork.opt->dst1opt);
1436                 kfree(np->cork.opt->hopopt);
1437                 kfree(np->cork.opt->srcrt);
1438                 kfree(np->cork.opt);
1439                 np->cork.opt = NULL;
1440         }
1441
1442         if (inet->cork.dst) {
1443                 dst_release(inet->cork.dst);
1444                 inet->cork.dst = NULL;
1445                 inet->cork.flags &= ~IPCORK_ALLFRAG;
1446         }
1447         memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1448 }
1449
1450 int ip6_push_pending_frames(struct sock *sk)
1451 {
1452         struct sk_buff *skb, *tmp_skb;
1453         struct sk_buff **tail_skb;
1454         struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1455         struct inet_sock *inet = inet_sk(sk);
1456         struct ipv6_pinfo *np = inet6_sk(sk);
1457         struct net *net = sock_net(sk);
1458         struct ipv6hdr *hdr;
1459         struct ipv6_txoptions *opt = np->cork.opt;
1460         struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1461         struct flowi *fl = &inet->cork.fl;
1462         unsigned char proto = fl->proto;
1463         int err = 0;
1464
1465         if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1466                 goto out;
1467         tail_skb = &(skb_shinfo(skb)->frag_list);
1468
1469         /* move skb->data to ip header from ext header */
1470         if (skb->data < skb_network_header(skb))
1471                 __skb_pull(skb, skb_network_offset(skb));
1472         while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1473                 __skb_pull(tmp_skb, skb_network_header_len(skb));
1474                 *tail_skb = tmp_skb;
1475                 tail_skb = &(tmp_skb->next);
1476                 skb->len += tmp_skb->len;
1477                 skb->data_len += tmp_skb->len;
1478                 skb->truesize += tmp_skb->truesize;
1479                 tmp_skb->destructor = NULL;
1480                 tmp_skb->sk = NULL;
1481         }
1482
1483         /* Allow local fragmentation. */
1484         if (np->pmtudisc < IPV6_PMTUDISC_DO)
1485                 skb->local_df = 1;
1486
1487         ipv6_addr_copy(final_dst, &fl->fl6_dst);
1488         __skb_pull(skb, skb_network_header_len(skb));
1489         if (opt && opt->opt_flen)
1490                 ipv6_push_frag_opts(skb, opt, &proto);
1491         if (opt && opt->opt_nflen)
1492                 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1493
1494         skb_push(skb, sizeof(struct ipv6hdr));
1495         skb_reset_network_header(skb);
1496         hdr = ipv6_hdr(skb);
1497
1498         *(__be32*)hdr = fl->fl6_flowlabel |
1499                      htonl(0x60000000 | ((int)np->cork.tclass << 20));
1500
1501         hdr->hop_limit = np->cork.hop_limit;
1502         hdr->nexthdr = proto;
1503         ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1504         ipv6_addr_copy(&hdr->daddr, final_dst);
1505
1506         skb->priority = sk->sk_priority;
1507         skb->mark = sk->sk_mark;
1508
1509         skb_dst_set(skb, dst_clone(&rt->u.dst));
1510         IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1511         if (proto == IPPROTO_ICMPV6) {
1512                 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1513
1514                 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1515                 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1516         }
1517
1518         err = ip6_local_out(skb);
1519         if (err) {
1520                 if (err > 0)
1521                         err = net_xmit_errno(err);
1522                 if (err)
1523                         goto error;
1524         }
1525
1526 out:
1527         ip6_cork_release(inet, np);
1528         return err;
1529 error:
1530         IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1531         goto out;
1532 }
1533
1534 void ip6_flush_pending_frames(struct sock *sk)
1535 {
1536         struct sk_buff *skb;
1537
1538         while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1539                 if (skb_dst(skb))
1540                         IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1541                                       IPSTATS_MIB_OUTDISCARDS);
1542                 kfree_skb(skb);
1543         }
1544
1545         ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1546 }