]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/ipv6/ip6_output.c
ipv6: allow to send packet after receiving ICMPv6 Too Big message with MTU field...
[net-next-2.6.git] / net / ipv6 / ip6_output.c
CommitLineData
1da177e4
LT
1/*
2 * IPv6 output functions
1ab1457c 3 * Linux INET6 implementation
1da177e4
LT
4 *
5 * Authors:
1ab1457c 6 * Pedro Roque <roque@di.fc.ul.pt>
1da177e4 7 *
1da177e4
LT
8 * Based on linux/net/ipv4/ip_output.c
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * Changes:
16 * A.N.Kuznetsov : airthmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
20 * etc.
21 *
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
26 * for datagram xmit
27 */
28
1da177e4 29#include <linux/errno.h>
ef76bc23 30#include <linux/kernel.h>
1da177e4
LT
31#include <linux/string.h>
32#include <linux/socket.h>
33#include <linux/net.h>
34#include <linux/netdevice.h>
35#include <linux/if_arp.h>
36#include <linux/in6.h>
37#include <linux/tcp.h>
38#include <linux/route.h>
b59f45d0 39#include <linux/module.h>
5a0e3ad6 40#include <linux/slab.h>
1da177e4
LT
41
42#include <linux/netfilter.h>
43#include <linux/netfilter_ipv6.h>
44
45#include <net/sock.h>
46#include <net/snmp.h>
47
48#include <net/ipv6.h>
49#include <net/ndisc.h>
50#include <net/protocol.h>
51#include <net/ip6_route.h>
52#include <net/addrconf.h>
53#include <net/rawv6.h>
54#include <net/icmp.h>
55#include <net/xfrm.h>
56#include <net/checksum.h>
7bc570c8 57#include <linux/mroute6.h>
1da177e4
LT
58
59static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
ef76bc23
HX
61int __ip6_local_out(struct sk_buff *skb)
62{
63 int len;
64
65 len = skb->len - sizeof(struct ipv6hdr);
66 if (len > IPV6_MAXPLEN)
67 len = 0;
68 ipv6_hdr(skb)->payload_len = htons(len);
69
adf30907 70 return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
ef76bc23
HX
71 dst_output);
72}
73
/*
 * Send a locally generated packet: run the LOCAL_OUT hook and, when the
 * hook returns 1 ("okfn not invoked, caller must continue"), hand the
 * skb to dst_output() ourselves.
 */
int ip6_local_out(struct sk_buff *skb)
{
	int err = __ip6_local_out(skb);

	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
85
ad643a79 86static int ip6_output_finish(struct sk_buff *skb)
1da177e4 87{
adf30907 88 struct dst_entry *dst = skb_dst(skb);
1da177e4 89
3644f0ce
SH
90 if (dst->hh)
91 return neigh_hh_output(dst->hh, skb);
92 else if (dst->neighbour)
1da177e4
LT
93 return dst->neighbour->output(skb);
94
483a47d2
DL
95 IP6_INC_STATS_BH(dev_net(dst->dev),
96 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
1da177e4
LT
97 kfree_skb(skb);
98 return -EINVAL;
99
100}
101
/* dev_loopback_xmit for use with netfilter.
 * Feeds a multicast copy back into the local receive path: marks the
 * skb as loopback, strips it down to the network header, and injects
 * it with netif_rx_ni() (process-context variant of netif_rx).
 */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	/* Locally generated copy: checksum already known good. */
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx_ni(newskb);
	return 0;
}
114
115
/*
 * Second-stage output: handles multicast loopback/statistics, then runs
 * the POST_ROUTING hook with ip6_output_finish() as continuation.
 */
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		/* Loop a copy back to local listeners when the socket asks
		 * for multicast loopback and either a multicast-routing
		 * socket may need it or a local member of the group exists. */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev)) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			/* hop_limit 0 means "deliver locally only":
			 * the wire copy is discarded. */
			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				 skb->len);
	}

	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}
157
628a5c56
JH
158static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
159{
160 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
161
162 return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
adf30907 163 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
628a5c56
JH
164}
165
1da177e4
LT
/*
 * Top-level IPv6 output: drop if IPv6 is administratively disabled on
 * the egress device, fragment if the packet exceeds the route MTU (and
 * is not GSO) or the route demands fragmentation, else transmit.
 */
int ip6_output(struct sk_buff *skb)
{
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	/* GSO skbs are segmented later; dst_allfrag forces a fragment
	 * header regardless of size (e.g. after an ICMP "too big" with
	 * MTU < IPV6_MIN_MTU). */
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}
182
1da177e4
LT
/*
 *	xmit an sk_buff (used by TCP)
 *
 *	Pushes any extension headers from @opt, builds the IPv6 header
 *	from @fl and the socket, and sends through the LOCAL_OUT hook.
 *	@ipfragok permits local fragmentation (sets skb->local_df).
 *	Returns 0/hook verdict on success, -ENOBUFS on headroom
 *	allocation failure, -EMSGSIZE if the packet exceeds the MTU and
 *	may not be fragmented.
 */
int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8 proto = fl->proto;
	int seg_len = skb->len;
	int hlimit = -1;
	int tclass = 0;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		/* Fragmentable options first, then non-fragmentable ones;
		 * a routing header may rewrite first_hop. */
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* Allow local fragmentation. */
	if (ipfragok)
		skb->local_df = 1;

	/*
	 *	Fill in the IPv6 header
	 */
	if (np) {
		tclass = np->tclass;
		hlimit = np->hop_limit;
	}
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	/* First 32 bits: version 6, traffic class, flow label. */
	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
				dst_output);
	}

	/* Too big and not allowed to fragment: tell our own stack via a
	 * PKT_TOOBIG ICMP so the socket layer learns the path MTU. */
	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);
278
1da177e4
LT
279/*
280 * To avoid extra problems ND packets are send through this
281 * routine. It's code duplication but I really want to avoid
282 * extra checks since ipv6_build_header is used by TCP (which
283 * is for us performance critical)
284 */
285
286int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
9acd9f3a 287 const struct in6_addr *saddr, const struct in6_addr *daddr,
1da177e4
LT
288 int proto, int len)
289{
290 struct ipv6_pinfo *np = inet6_sk(sk);
291 struct ipv6hdr *hdr;
292 int totlen;
293
294 skb->protocol = htons(ETH_P_IPV6);
295 skb->dev = dev;
296
297 totlen = len + sizeof(struct ipv6hdr);
298
55f79cc0
ACM
299 skb_reset_network_header(skb);
300 skb_put(skb, sizeof(struct ipv6hdr));
0660e03f 301 hdr = ipv6_hdr(skb);
1da177e4 302
ae08e1f0 303 *(__be32*)hdr = htonl(0x60000000);
1da177e4
LT
304
305 hdr->payload_len = htons(len);
306 hdr->nexthdr = proto;
307 hdr->hop_limit = np->hop_limit;
308
309 ipv6_addr_copy(&hdr->saddr, saddr);
310 ipv6_addr_copy(&hdr->daddr, daddr);
311
312 return 0;
313}
314
/*
 * Deliver a Router Alert packet to every registered RA socket whose
 * selector matches @sel (and whose binding fits the input device).
 * Each recipient but the last gets a clone; the last consumes @skb
 * itself.  Returns 1 if the skb was consumed, 0 if the caller still
 * owns it.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			/* Clone for the previous match; defer the original
			 * skb to the final matching socket. */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
343
e21e0b5f
VN
/*
 * Decide how a packet destined to a proxied (NDP-proxy) address should
 * be handled.  Returns 1 to pass it to local input (unicast neighbour
 * discovery), -1 to signal failure and drop (link-local destination),
 * 0 to forward normally.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	/* Walk past extension headers to find the transport protocol. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Need at least the ICMPv6 type byte in linear data. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
394
1da177e4
LT
/* NF_INET_FORWARD continuation: hand the skb to the route's output. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
399
/*
 * Forward a received IPv6 packet: policy/hop-limit/proxy-NDP checks,
 * optional ICMP redirect generation, MTU check, then the FORWARD
 * netfilter hook.  Consumes the skb on every path.  Returns 0 or a
 * negative errno (-ETIMEDOUT hop limit exceeded, -EMSGSIZE too big,
 * -EINVAL dropped).
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	/* LRO-merged skbs must never be forwarded (wrong on the wire). */
	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	/* xfrm may have switched the route; re-read the dst. */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	/* A router must always accept packets up to IPV6_MIN_MTU. */
	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* skb_cow may have reallocated; re-fetch the header pointer. */
	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}
550
/*
 * Copy per-packet metadata (type, priority, dst reference, device,
 * mark, tc index, netfilter and security state) from one skb to
 * another.  Used to make each fragment carry the original's context.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	/* Take an extra reference on the shared dst for the copy. */
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}
571
/*
 * Find where the fragment header must be inserted: the offset (from
 * the network header) of the first extension header that belongs in
 * the fragmentable part.  Per RFC 2460 the hop-by-hop, routing (and a
 * destination-options header preceding the routing header / carrying
 * a HAO) stay in the unfragmentable part.  On return *nexthdr points
 * at the "next header" byte to be overwritten with NEXTHDR_FRAGMENT.
 */
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			/* A Home Address option keeps this header in the
			 * unfragmentable part. */
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default :
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}
610
/*
 * Fragment an IPv6 packet and emit each fragment via @output.
 *
 * Fast path: if the skb already carries a suitable frag_list (each
 * fragment correctly sized, 8-byte aligned, unshared, with headroom
 * for the headers), headers are prepended in place and the chain is
 * sent without copying.  Otherwise the slow path allocates a fresh
 * skb per fragment and copies the data.  Consumes @skb; returns 0 on
 * success or a negative errno.
 */
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err=0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb it not generated by a local socket.
	 */
	if (!skb->local_df && skb->len > mtu) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/* Honour a smaller per-socket fragment size if configured. */
	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	/* From here on, mtu is the per-fragment payload budget. */
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frags(skb)) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				/* Transfer write-memory accounting to the
				 * fragments so the socket is charged once. */
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		/* Open a gap for the fragment header, then restore the
		 * saved unfragmentable headers in front of it. */
		__skb_pull(skb, hlen);
		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if(!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		/* Error: free the fragments not yet handed off. */
		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while(left > 0)	{
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending upto and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		/* Pick an identification for the first fragment and reuse
		 * it for the rest. */
		if (!frag_id) {
			ipv6_select_ident(fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
875
cf6b1982
YH
876static inline int ip6_rt_check(struct rt6key *rt_key,
877 struct in6_addr *fl_addr,
878 struct in6_addr *addr_cache)
879{
880 return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
881 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
882}
883
497c615a
HX
/*
 * Validate a socket's cached dst against the flow @fl.  Returns the
 * dst if still usable, otherwise releases it and returns NULL so the
 * caller performs a fresh route lookup.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
923
/*
 * Common tail of the dst lookup helpers: resolve the route if not yet
 * done, pick a source address when the flow leaves it unspecified, and
 * (with optimistic DAD) reroute via the default router when the chosen
 * source is optimistic and the neighbour is not yet valid.  On error
 * releases *dst, sets it to NULL and returns a negative errno.
 */
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}
34a0b3cd 990
497c615a
HX
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *	On error *@dst is set to NULL.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1007
497c615a
HX
/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		/* Reuse the cached dst only after ip6_sk_dst_check()
		 * confirms it still matches this flow. */
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1032
/*
 * Append datagram data for UDP fragmentation offload (UFO): build (or
 * extend) a single large skb whose segmentation into MTU-sized
 * fragments is deferred to the device/GSO layer.  Returns 0 on
 * success; on failure frees the skb and returns a negative errno so
 * the caller can fall back to the normal append path.
 */
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu,unsigned int flags)

{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb,fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk,skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		/* Pre-select the fragment id so all GSO segments share it. */
		ipv6_select_ident(&fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow normal path
	 */
	kfree_skb(skb);

	return err;
}
1da177e4 1095
0178b695
HX
1096static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1097 gfp_t gfp)
1098{
1099 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1100}
1101
1102static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1103 gfp_t gfp)
1104{
1105 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1106}
1107
41a1f8ea
YH
/**
 *	ip6_append_data - append user payload to the socket's cork queue
 *	@sk: socket sending the data
 *	@getfrag: callback that copies user data into the skb
 *	@from: opaque cookie handed to @getfrag
 *	@length: number of payload bytes to append
 *	@transhdrlen: transport header length (non-zero only on first call)
 *	@hlimit: hop limit for the packet (captured on first call)
 *	@tclass: traffic class (captured on first call)
 *	@opt: IPv6 extension-header options (captured on first call)
 *	@fl: flow describing the destination (captured on first call)
 *	@rt: route to use (captured on first call)
 *	@flags: MSG_* flags (MSG_MORE, MSG_DONTWAIT, MSG_PROBE)
 *
 *	Builds MTU-sized skbs on sk->sk_write_queue; the packet is only
 *	emitted later by ip6_push_pending_frames().  On the first call
 *	(empty write queue) the cork state - duplicated options, route,
 *	flow, hop limit, traffic class, fragment size - is captured;
 *	subsequent calls reuse that state and ignore the corresponding
 *	arguments.
 *
 *	Returns 0 on success or a negative errno.
 */
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			/* A non-NULL cork.opt here means a previous cork was
			 * never released - refuse rather than leak/overwrite. */
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			/* Deep-copy each extension header so the caller's @opt
			 * may be freed while the cork is still pending. */
			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa*/
		}
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		/* IPV6_PMTUDISC_PROBE ignores the path MTU and uses the
		 * device MTU directly. */
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		/* Account tunnel/IPsec header space in the first skb. */
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		/* Subsequent append: reuse the state captured at cork time;
		 * the caller-supplied rt/fl/opt arguments are ignored. */
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	/* Largest payload per fragment: 8-byte aligned, minus frag header. */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	/* Oversized UDP on a UFO-capable device: hand the whole datagram
	 * to the hardware as a single GSO skb. */
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				/* First skb: may block per MSG_DONTWAIT. */
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				/* Follow-on skbs: non-blocking, bounded by
				 * twice the send buffer. */
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the non-8-byte-aligned tail of the
				 * previous skb into this one, fixing up the
				 * previous skb's checksum. */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			/* No scatter-gather: copy into the skb's linear area. */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter-gather: copy into page fragments, reusing
			 * the per-socket partially-filled page if possible. */
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if(i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	/* Roll back the bytes we claimed but failed to queue. */
	inet->cork.length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1426
bf138862
PE
1427static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1428{
0178b695
HX
1429 if (np->cork.opt) {
1430 kfree(np->cork.opt->dst0opt);
1431 kfree(np->cork.opt->dst1opt);
1432 kfree(np->cork.opt->hopopt);
1433 kfree(np->cork.opt->srcrt);
1434 kfree(np->cork.opt);
1435 np->cork.opt = NULL;
1436 }
1437
c8cdaf99
YH
1438 if (inet->cork.dst) {
1439 dst_release(inet->cork.dst);
1440 inet->cork.dst = NULL;
bf138862
PE
1441 inet->cork.flags &= ~IPCORK_ALLFRAG;
1442 }
1443 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1444}
1445
1da177e4
LT
/*
 * ip6_push_pending_frames - coalesce the corked skbs, prepend the IPv6
 * header (and extension headers) and hand the packet to the output
 * path via ip6_local_out().  Releases the cork state in all cases.
 *
 * Returns 0 on success or a negative errno.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain every remaining queued skb onto the first one's frag_list,
	 * accumulating length/truesize and detaching each from the socket
	 * so only the head skb carries the destructor accounting. */
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	/* Routing headers may rewrite the destination; keep the final one. */
	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* First 32 bits: version 6, traffic class, flow label. */
	*(__be32*)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->u.dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		/* Positive return values are congestion notifications;
		 * map them through net_xmit_errno(). */
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
1529
1530void ip6_flush_pending_frames(struct sock *sk)
1531{
1da177e4
LT
1532 struct sk_buff *skb;
1533
1534 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
adf30907
ED
1535 if (skb_dst(skb))
1536 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
e1f52208 1537 IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
1538 kfree_skb(skb);
1539 }
1540
bf138862 1541 ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1da177e4 1542}