]> bbs.cooldavid.org Git - net-next-2.6.git/blame - net/ipv6/ip6_output.c
ipv6: fix the comment of ip6_xmit()
[net-next-2.6.git] / net / ipv6 / ip6_output.c
CommitLineData
1da177e4
LT
1/*
2 * IPv6 output functions
1ab1457c 3 * Linux INET6 implementation
1da177e4
LT
4 *
5 * Authors:
1ab1457c 6 * Pedro Roque <roque@di.fc.ul.pt>
1da177e4 7 *
1da177e4
LT
8 * Based on linux/net/ipv4/ip_output.c
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * Changes:
16 * A.N.Kuznetsov : arithmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
20 * etc.
21 *
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
26 * for datagram xmit
27 */
28
1da177e4 29#include <linux/errno.h>
ef76bc23 30#include <linux/kernel.h>
1da177e4
LT
31#include <linux/string.h>
32#include <linux/socket.h>
33#include <linux/net.h>
34#include <linux/netdevice.h>
35#include <linux/if_arp.h>
36#include <linux/in6.h>
37#include <linux/tcp.h>
38#include <linux/route.h>
b59f45d0 39#include <linux/module.h>
5a0e3ad6 40#include <linux/slab.h>
1da177e4
LT
41
42#include <linux/netfilter.h>
43#include <linux/netfilter_ipv6.h>
44
45#include <net/sock.h>
46#include <net/snmp.h>
47
48#include <net/ipv6.h>
49#include <net/ndisc.h>
50#include <net/protocol.h>
51#include <net/ip6_route.h>
52#include <net/addrconf.h>
53#include <net/rawv6.h>
54#include <net/icmp.h>
55#include <net/xfrm.h>
56#include <net/checksum.h>
7bc570c8 57#include <linux/mroute6.h>
1da177e4
LT
58
59static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60
ef76bc23
HX
61int __ip6_local_out(struct sk_buff *skb)
62{
63 int len;
64
65 len = skb->len - sizeof(struct ipv6hdr);
66 if (len > IPV6_MAXPLEN)
67 len = 0;
68 ipv6_hdr(skb)->payload_len = htons(len);
69
adf30907 70 return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb_dst(skb)->dev,
ef76bc23
HX
71 dst_output);
72}
73
/*
 * Send a locally generated packet: run the LOCAL_OUT hook via
 * __ip6_local_out() and, when the hook result is 1, hand the packet to
 * dst_output().  Any other hook result is returned unchanged.
 */
int ip6_local_out(struct sk_buff *skb)
{
	int rc = __ip6_local_out(skb);

	if (unlikely(rc != 1))
		return rc;

	return dst_output(skb);
}
EXPORT_SYMBOL_GPL(ip6_local_out);
85
ad643a79 86static int ip6_output_finish(struct sk_buff *skb)
1da177e4 87{
adf30907 88 struct dst_entry *dst = skb_dst(skb);
1da177e4 89
3644f0ce
SH
90 if (dst->hh)
91 return neigh_hh_output(dst->hh, skb);
92 else if (dst->neighbour)
1da177e4
LT
93 return dst->neighbour->output(skb);
94
483a47d2
DL
95 IP6_INC_STATS_BH(dev_net(dst->dev),
96 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
1da177e4
LT
97 kfree_skb(skb);
98 return -EINVAL;
99
100}
101
102/* dev_loopback_xmit for use with netfilter. */
103static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
104{
459a98ed 105 skb_reset_mac_header(newskb);
bbe735e4 106 __skb_pull(newskb, skb_network_offset(newskb));
1da177e4
LT
107 newskb->pkt_type = PACKET_LOOPBACK;
108 newskb->ip_summed = CHECKSUM_UNNECESSARY;
adf30907 109 WARN_ON(!skb_dst(newskb));
1da177e4
LT
110
111 netif_rx(newskb);
112 return 0;
113}
114
115
116static int ip6_output2(struct sk_buff *skb)
117{
adf30907 118 struct dst_entry *dst = skb_dst(skb);
1da177e4
LT
119 struct net_device *dev = dst->dev;
120
121 skb->protocol = htons(ETH_P_IPV6);
122 skb->dev = dev;
123
0660e03f 124 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
adf30907 125 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1da177e4 126
7ad6848c 127 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
bd91b8bf
BT
128 ((mroute6_socket(dev_net(dev)) &&
129 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
7bc570c8
YH
130 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
131 &ipv6_hdr(skb)->saddr))) {
1da177e4
LT
132 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
133
134 /* Do not check for IFF_ALLMULTI; multicast routing
135 is not supported in any case.
136 */
137 if (newskb)
6e23ae2a
PM
138 NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
139 NULL, newskb->dev,
1da177e4
LT
140 ip6_dev_loopback_xmit);
141
0660e03f 142 if (ipv6_hdr(skb)->hop_limit == 0) {
3bd653c8
DL
143 IP6_INC_STATS(dev_net(dev), idev,
144 IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
145 kfree_skb(skb);
146 return 0;
147 }
148 }
149
edf391ff
NH
150 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
151 skb->len);
1da177e4
LT
152 }
153
6e23ae2a
PM
154 return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
155 ip6_output_finish);
1da177e4
LT
156}
157
628a5c56
JH
158static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
159{
160 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
161
162 return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
adf30907 163 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
628a5c56
JH
164}
165
1da177e4
LT
166int ip6_output(struct sk_buff *skb)
167{
adf30907 168 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
778d80be 169 if (unlikely(idev->cnf.disable_ipv6)) {
adf30907 170 IP6_INC_STATS(dev_net(skb_dst(skb)->dev), idev,
3bd653c8 171 IPSTATS_MIB_OUTDISCARDS);
778d80be
YH
172 kfree_skb(skb);
173 return 0;
174 }
175
628a5c56 176 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
adf30907 177 dst_allfrag(skb_dst(skb)))
1da177e4
LT
178 return ip6_fragment(skb, ip6_output2);
179 else
180 return ip6_output2(skb);
181}
182
1da177e4 183/*
b5d43998 184 * xmit an sk_buff (used by TCP, SCTP and DCCP)
1da177e4
LT
185 */
186
187int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
4e15ed4d 188 struct ipv6_txoptions *opt)
1da177e4 189{
3bd653c8 190 struct net *net = sock_net(sk);
b30bd282 191 struct ipv6_pinfo *np = inet6_sk(sk);
1da177e4 192 struct in6_addr *first_hop = &fl->fl6_dst;
adf30907 193 struct dst_entry *dst = skb_dst(skb);
1da177e4
LT
194 struct ipv6hdr *hdr;
195 u8 proto = fl->proto;
196 int seg_len = skb->len;
e651f03a
GR
197 int hlimit = -1;
198 int tclass = 0;
1da177e4
LT
199 u32 mtu;
200
201 if (opt) {
c2636b4d 202 unsigned int head_room;
1da177e4
LT
203
204 /* First: exthdrs may take lots of space (~8K for now)
205 MAX_HEADER is not enough.
206 */
207 head_room = opt->opt_nflen + opt->opt_flen;
208 seg_len += head_room;
209 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
210
211 if (skb_headroom(skb) < head_room) {
212 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
a11d206d 213 if (skb2 == NULL) {
adf30907 214 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
a11d206d
YH
215 IPSTATS_MIB_OUTDISCARDS);
216 kfree_skb(skb);
1da177e4
LT
217 return -ENOBUFS;
218 }
a11d206d
YH
219 kfree_skb(skb);
220 skb = skb2;
1da177e4
LT
221 if (sk)
222 skb_set_owner_w(skb, sk);
223 }
224 if (opt->opt_flen)
225 ipv6_push_frag_opts(skb, opt, &proto);
226 if (opt->opt_nflen)
227 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
228 }
229
e2d1bca7
ACM
230 skb_push(skb, sizeof(struct ipv6hdr));
231 skb_reset_network_header(skb);
0660e03f 232 hdr = ipv6_hdr(skb);
1da177e4
LT
233
234 /*
235 * Fill in the IPv6 header
236 */
e651f03a
GR
237 if (np) {
238 tclass = np->tclass;
1da177e4 239 hlimit = np->hop_limit;
e651f03a 240 }
1da177e4 241 if (hlimit < 0)
6b75d090 242 hlimit = ip6_dst_hoplimit(dst);
1da177e4 243
90bcaf7b 244 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
41a1f8ea 245
1da177e4
LT
246 hdr->payload_len = htons(seg_len);
247 hdr->nexthdr = proto;
248 hdr->hop_limit = hlimit;
249
250 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
251 ipv6_addr_copy(&hdr->daddr, first_hop);
252
a2c2064f 253 skb->priority = sk->sk_priority;
4a19ec58 254 skb->mark = sk->sk_mark;
a2c2064f 255
1da177e4 256 mtu = dst_mtu(dst);
283d07ac 257 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
adf30907 258 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
edf391ff 259 IPSTATS_MIB_OUT, skb->len);
6e23ae2a 260 return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
6869c4d8 261 dst_output);
1da177e4
LT
262 }
263
264 if (net_ratelimit())
265 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
266 skb->dev = dst->dev;
3ffe533c 267 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
adf30907 268 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
1da177e4
LT
269 kfree_skb(skb);
270 return -EMSGSIZE;
271}
272
7159039a
YH
273EXPORT_SYMBOL(ip6_xmit);
274
1da177e4
LT
275/*
276 * To avoid extra problems ND packets are send through this
277 * routine. It's code duplication but I really want to avoid
278 * extra checks since ipv6_build_header is used by TCP (which
279 * is for us performance critical)
280 */
281
282int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
9acd9f3a 283 const struct in6_addr *saddr, const struct in6_addr *daddr,
1da177e4
LT
284 int proto, int len)
285{
286 struct ipv6_pinfo *np = inet6_sk(sk);
287 struct ipv6hdr *hdr;
288 int totlen;
289
290 skb->protocol = htons(ETH_P_IPV6);
291 skb->dev = dev;
292
293 totlen = len + sizeof(struct ipv6hdr);
294
55f79cc0
ACM
295 skb_reset_network_header(skb);
296 skb_put(skb, sizeof(struct ipv6hdr));
0660e03f 297 hdr = ipv6_hdr(skb);
1da177e4 298
ae08e1f0 299 *(__be32*)hdr = htonl(0x60000000);
1da177e4
LT
300
301 hdr->payload_len = htons(len);
302 hdr->nexthdr = proto;
303 hdr->hop_limit = np->hop_limit;
304
305 ipv6_addr_copy(&hdr->saddr, saddr);
306 ipv6_addr_copy(&hdr->daddr, daddr);
307
308 return 0;
309}
310
311static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
312{
313 struct ip6_ra_chain *ra;
314 struct sock *last = NULL;
315
316 read_lock(&ip6_ra_lock);
317 for (ra = ip6_ra_chain; ra; ra = ra->next) {
318 struct sock *sk = ra->sk;
0bd1b59b
AM
319 if (sk && ra->sel == sel &&
320 (!sk->sk_bound_dev_if ||
321 sk->sk_bound_dev_if == skb->dev->ifindex)) {
1da177e4
LT
322 if (last) {
323 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
324 if (skb2)
325 rawv6_rcv(last, skb2);
326 }
327 last = sk;
328 }
329 }
330
331 if (last) {
332 rawv6_rcv(last, skb);
333 read_unlock(&ip6_ra_lock);
334 return 1;
335 }
336 read_unlock(&ip6_ra_lock);
337 return 0;
338}
339
e21e0b5f
VN
340static int ip6_forward_proxy_check(struct sk_buff *skb)
341{
0660e03f 342 struct ipv6hdr *hdr = ipv6_hdr(skb);
e21e0b5f
VN
343 u8 nexthdr = hdr->nexthdr;
344 int offset;
345
346 if (ipv6_ext_hdr(nexthdr)) {
347 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
348 if (offset < 0)
349 return 0;
350 } else
351 offset = sizeof(struct ipv6hdr);
352
353 if (nexthdr == IPPROTO_ICMPV6) {
354 struct icmp6hdr *icmp6;
355
d56f90a7
ACM
356 if (!pskb_may_pull(skb, (skb_network_header(skb) +
357 offset + 1 - skb->data)))
e21e0b5f
VN
358 return 0;
359
d56f90a7 360 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
e21e0b5f
VN
361
362 switch (icmp6->icmp6_type) {
363 case NDISC_ROUTER_SOLICITATION:
364 case NDISC_ROUTER_ADVERTISEMENT:
365 case NDISC_NEIGHBOUR_SOLICITATION:
366 case NDISC_NEIGHBOUR_ADVERTISEMENT:
367 case NDISC_REDIRECT:
368 /* For reaction involving unicast neighbor discovery
369 * message destined to the proxied address, pass it to
370 * input function.
371 */
372 return 1;
373 default:
374 break;
375 }
376 }
377
74553b09
VN
378 /*
379 * The proxying router can't forward traffic sent to a link-local
380 * address, so signal the sender and discard the packet. This
381 * behavior is clarified by the MIPv6 specification.
382 */
383 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
384 dst_link_failure(skb);
385 return -1;
386 }
387
e21e0b5f
VN
388 return 0;
389}
390
1da177e4
LT
/*
 * Continuation passed to the NF_INET_FORWARD hook by ip6_forward():
 * hands the packet to the output method of its route.
 */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	int rc = dst_output(skb);

	return rc;
}
395
396int ip6_forward(struct sk_buff *skb)
397{
adf30907 398 struct dst_entry *dst = skb_dst(skb);
0660e03f 399 struct ipv6hdr *hdr = ipv6_hdr(skb);
1da177e4 400 struct inet6_skb_parm *opt = IP6CB(skb);
c346dca1 401 struct net *net = dev_net(dst->dev);
14f3ad6f 402 u32 mtu;
1ab1457c 403
53b7997f 404 if (net->ipv6.devconf_all->forwarding == 0)
1da177e4
LT
405 goto error;
406
4497b076
BH
407 if (skb_warn_if_lro(skb))
408 goto drop;
409
1da177e4 410 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
3bd653c8 411 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
1da177e4
LT
412 goto drop;
413 }
414
35fc92a9 415 skb_forward_csum(skb);
1da177e4
LT
416
417 /*
418 * We DO NOT make any processing on
419 * RA packets, pushing them to user level AS IS
420 * without ane WARRANTY that application will be able
421 * to interpret them. The reason is that we
422 * cannot make anything clever here.
423 *
424 * We are not end-node, so that if packet contains
425 * AH/ESP, we cannot make anything.
426 * Defragmentation also would be mistake, RA packets
427 * cannot be fragmented, because there is no warranty
428 * that different fragments will go along one path. --ANK
429 */
430 if (opt->ra) {
d56f90a7 431 u8 *ptr = skb_network_header(skb) + opt->ra;
1da177e4
LT
432 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
433 return 0;
434 }
435
436 /*
437 * check and decrement ttl
438 */
439 if (hdr->hop_limit <= 1) {
440 /* Force OUTPUT device used as source address */
441 skb->dev = dst->dev;
3ffe533c 442 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
483a47d2
DL
443 IP6_INC_STATS_BH(net,
444 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
1da177e4
LT
445
446 kfree_skb(skb);
447 return -ETIMEDOUT;
448 }
449
fbea49e1 450 /* XXX: idev->cnf.proxy_ndp? */
53b7997f 451 if (net->ipv6.devconf_all->proxy_ndp &&
8a3edd80 452 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
74553b09
VN
453 int proxied = ip6_forward_proxy_check(skb);
454 if (proxied > 0)
e21e0b5f 455 return ip6_input(skb);
74553b09 456 else if (proxied < 0) {
3bd653c8
DL
457 IP6_INC_STATS(net, ip6_dst_idev(dst),
458 IPSTATS_MIB_INDISCARDS);
74553b09
VN
459 goto drop;
460 }
e21e0b5f
VN
461 }
462
1da177e4 463 if (!xfrm6_route_forward(skb)) {
3bd653c8 464 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
1da177e4
LT
465 goto drop;
466 }
adf30907 467 dst = skb_dst(skb);
1da177e4
LT
468
469 /* IPv6 specs say nothing about it, but it is clear that we cannot
470 send redirects to source routed frames.
1e5dc146 471 We don't send redirects to frames decapsulated from IPsec.
1da177e4 472 */
1e5dc146 473 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
def8b4fa 474 !skb_sec_path(skb)) {
1da177e4
LT
475 struct in6_addr *target = NULL;
476 struct rt6_info *rt;
477 struct neighbour *n = dst->neighbour;
478
479 /*
480 * incoming and outgoing devices are the same
481 * send a redirect.
482 */
483
484 rt = (struct rt6_info *) dst;
485 if ((rt->rt6i_flags & RTF_GATEWAY))
486 target = (struct in6_addr*)&n->primary_key;
487 else
488 target = &hdr->daddr;
489
490 /* Limit redirects both by destination (here)
491 and by source (inside ndisc_send_redirect)
492 */
493 if (xrlim_allow(dst, 1*HZ))
494 ndisc_send_redirect(skb, n, target);
5bb1ab09
DS
495 } else {
496 int addrtype = ipv6_addr_type(&hdr->saddr);
497
1da177e4 498 /* This check is security critical. */
f81b2e7d
YH
499 if (addrtype == IPV6_ADDR_ANY ||
500 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
5bb1ab09
DS
501 goto error;
502 if (addrtype & IPV6_ADDR_LINKLOCAL) {
503 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
3ffe533c 504 ICMPV6_NOT_NEIGHBOUR, 0);
5bb1ab09
DS
505 goto error;
506 }
1da177e4
LT
507 }
508
14f3ad6f
UW
509 mtu = dst_mtu(dst);
510 if (mtu < IPV6_MIN_MTU)
511 mtu = IPV6_MIN_MTU;
512
513 if (skb->len > mtu) {
1da177e4
LT
514 /* Again, force OUTPUT device used as source address */
515 skb->dev = dst->dev;
14f3ad6f 516 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
483a47d2
DL
517 IP6_INC_STATS_BH(net,
518 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
519 IP6_INC_STATS_BH(net,
520 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
1da177e4
LT
521 kfree_skb(skb);
522 return -EMSGSIZE;
523 }
524
525 if (skb_cow(skb, dst->dev->hard_header_len)) {
3bd653c8 526 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
527 goto drop;
528 }
529
0660e03f 530 hdr = ipv6_hdr(skb);
1da177e4
LT
531
532 /* Mangling hops number delayed to point after skb COW */
1ab1457c 533
1da177e4
LT
534 hdr->hop_limit--;
535
483a47d2 536 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
6e23ae2a
PM
537 return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
538 ip6_forward_finish);
1da177e4
LT
539
540error:
483a47d2 541 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
1da177e4
LT
542drop:
543 kfree_skb(skb);
544 return -EINVAL;
545}
546
547static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
548{
549 to->pkt_type = from->pkt_type;
550 to->priority = from->priority;
551 to->protocol = from->protocol;
adf30907
ED
552 skb_dst_drop(to);
553 skb_dst_set(to, dst_clone(skb_dst(from)));
1da177e4 554 to->dev = from->dev;
82e91ffe 555 to->mark = from->mark;
1da177e4
LT
556
557#ifdef CONFIG_NET_SCHED
558 to->tc_index = from->tc_index;
559#endif
e7ac05f3 560 nf_copy(to, from);
ba9dda3a
JK
561#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
562 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
563 to->nf_trace = from->nf_trace;
564#endif
984bc16c 565 skb_copy_secmark(to, from);
1da177e4
LT
566}
567
568int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
569{
570 u16 offset = sizeof(struct ipv6hdr);
0660e03f
ACM
571 struct ipv6_opt_hdr *exthdr =
572 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
27a884dc 573 unsigned int packet_len = skb->tail - skb->network_header;
1da177e4 574 int found_rhdr = 0;
0660e03f 575 *nexthdr = &ipv6_hdr(skb)->nexthdr;
1da177e4
LT
576
577 while (offset + 1 <= packet_len) {
578
579 switch (**nexthdr) {
580
581 case NEXTHDR_HOP:
27637df9 582 break;
1da177e4 583 case NEXTHDR_ROUTING:
27637df9
MN
584 found_rhdr = 1;
585 break;
1da177e4 586 case NEXTHDR_DEST:
59fbb3a6 587#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
27637df9
MN
588 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
589 break;
590#endif
591 if (found_rhdr)
592 return offset;
1da177e4
LT
593 break;
594 default :
595 return offset;
596 }
27637df9
MN
597
598 offset += ipv6_optlen(exthdr);
599 *nexthdr = &exthdr->nexthdr;
d56f90a7
ACM
600 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
601 offset);
1da177e4
LT
602 }
603
604 return offset;
605}
606
607static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
608{
1da177e4 609 struct sk_buff *frag;
adf30907 610 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
d91675f9 611 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
1da177e4
LT
612 struct ipv6hdr *tmp_hdr;
613 struct frag_hdr *fh;
614 unsigned int mtu, hlen, left, len;
ae08e1f0 615 __be32 frag_id = 0;
1da177e4
LT
616 int ptr, offset = 0, err=0;
617 u8 *prevhdr, nexthdr = 0;
adf30907 618 struct net *net = dev_net(skb_dst(skb)->dev);
1da177e4 619
1da177e4
LT
620 hlen = ip6_find_1stfragopt(skb, &prevhdr);
621 nexthdr = *prevhdr;
622
628a5c56 623 mtu = ip6_skb_dst_mtu(skb);
b881ef76
JH
624
625 /* We must not fragment if the socket is set to force MTU discovery
14f3ad6f 626 * or if the skb it not generated by a local socket.
b881ef76 627 */
b5c15fc0 628 if (!skb->local_df) {
adf30907 629 skb->dev = skb_dst(skb)->dev;
3ffe533c 630 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
adf30907 631 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
3bd653c8 632 IPSTATS_MIB_FRAGFAILS);
b881ef76
JH
633 kfree_skb(skb);
634 return -EMSGSIZE;
635 }
636
d91675f9
YH
637 if (np && np->frag_size < mtu) {
638 if (np->frag_size)
639 mtu = np->frag_size;
640 }
641 mtu -= hlen + sizeof(struct frag_hdr);
1da177e4 642
4d9092bb 643 if (skb_has_frags(skb)) {
1da177e4 644 int first_len = skb_pagelen(skb);
29ffe1a5 645 int truesizes = 0;
1da177e4
LT
646
647 if (first_len - hlen > mtu ||
648 ((first_len - hlen) & 7) ||
649 skb_cloned(skb))
650 goto slow_path;
651
4d9092bb 652 skb_walk_frags(skb, frag) {
1da177e4
LT
653 /* Correct geometry. */
654 if (frag->len > mtu ||
655 ((frag->len & 7) && frag->next) ||
656 skb_headroom(frag) < hlen)
657 goto slow_path;
658
1da177e4
LT
659 /* Partially cloned skb? */
660 if (skb_shared(frag))
661 goto slow_path;
2fdba6b0
HX
662
663 BUG_ON(frag->sk);
664 if (skb->sk) {
2fdba6b0
HX
665 frag->sk = skb->sk;
666 frag->destructor = sock_wfree;
29ffe1a5 667 truesizes += frag->truesize;
2fdba6b0 668 }
1da177e4
LT
669 }
670
671 err = 0;
672 offset = 0;
673 frag = skb_shinfo(skb)->frag_list;
4d9092bb 674 skb_frag_list_init(skb);
1da177e4
LT
675 /* BUILD HEADER */
676
9a217a1c 677 *prevhdr = NEXTHDR_FRAGMENT;
d56f90a7 678 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
1da177e4 679 if (!tmp_hdr) {
adf30907 680 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
3bd653c8 681 IPSTATS_MIB_FRAGFAILS);
1da177e4
LT
682 return -ENOMEM;
683 }
684
1da177e4
LT
685 __skb_pull(skb, hlen);
686 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
e2d1bca7
ACM
687 __skb_push(skb, hlen);
688 skb_reset_network_header(skb);
d56f90a7 689 memcpy(skb_network_header(skb), tmp_hdr, hlen);
1da177e4 690
7ea2f2c5 691 ipv6_select_ident(fh);
1da177e4
LT
692 fh->nexthdr = nexthdr;
693 fh->reserved = 0;
694 fh->frag_off = htons(IP6_MF);
695 frag_id = fh->identification;
696
697 first_len = skb_pagelen(skb);
698 skb->data_len = first_len - skb_headlen(skb);
29ffe1a5 699 skb->truesize -= truesizes;
1da177e4 700 skb->len = first_len;
0660e03f
ACM
701 ipv6_hdr(skb)->payload_len = htons(first_len -
702 sizeof(struct ipv6hdr));
a11d206d
YH
703
704 dst_hold(&rt->u.dst);
1da177e4
LT
705
706 for (;;) {
707 /* Prepare header of the next frame,
708 * before previous one went down. */
709 if (frag) {
710 frag->ip_summed = CHECKSUM_NONE;
badff6d0 711 skb_reset_transport_header(frag);
1da177e4 712 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
e2d1bca7
ACM
713 __skb_push(frag, hlen);
714 skb_reset_network_header(frag);
d56f90a7
ACM
715 memcpy(skb_network_header(frag), tmp_hdr,
716 hlen);
1da177e4
LT
717 offset += skb->len - hlen - sizeof(struct frag_hdr);
718 fh->nexthdr = nexthdr;
719 fh->reserved = 0;
720 fh->frag_off = htons(offset);
721 if (frag->next != NULL)
722 fh->frag_off |= htons(IP6_MF);
723 fh->identification = frag_id;
0660e03f
ACM
724 ipv6_hdr(frag)->payload_len =
725 htons(frag->len -
726 sizeof(struct ipv6hdr));
1da177e4
LT
727 ip6_copy_metadata(frag, skb);
728 }
1ab1457c 729
1da177e4 730 err = output(skb);
dafee490 731 if(!err)
3bd653c8
DL
732 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
733 IPSTATS_MIB_FRAGCREATES);
dafee490 734
1da177e4
LT
735 if (err || !frag)
736 break;
737
738 skb = frag;
739 frag = skb->next;
740 skb->next = NULL;
741 }
742
a51482bd 743 kfree(tmp_hdr);
1da177e4
LT
744
745 if (err == 0) {
3bd653c8
DL
746 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
747 IPSTATS_MIB_FRAGOKS);
a11d206d 748 dst_release(&rt->u.dst);
1da177e4
LT
749 return 0;
750 }
751
752 while (frag) {
753 skb = frag->next;
754 kfree_skb(frag);
755 frag = skb;
756 }
757
3bd653c8
DL
758 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
759 IPSTATS_MIB_FRAGFAILS);
a11d206d 760 dst_release(&rt->u.dst);
1da177e4
LT
761 return err;
762 }
763
764slow_path:
765 left = skb->len - hlen; /* Space per frame */
766 ptr = hlen; /* Where to start from */
767
768 /*
769 * Fragment the datagram.
770 */
771
772 *prevhdr = NEXTHDR_FRAGMENT;
773
774 /*
775 * Keep copying data until we run out.
776 */
777 while(left > 0) {
778 len = left;
779 /* IF: it doesn't fit, use 'mtu' - the data space left */
780 if (len > mtu)
781 len = mtu;
782 /* IF: we are not sending upto and including the packet end
783 then align the next start on an eight byte boundary */
784 if (len < left) {
785 len &= ~7;
786 }
787 /*
788 * Allocate buffer.
789 */
790
f5184d26 791 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
64ce2073 792 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
adf30907 793 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
a11d206d 794 IPSTATS_MIB_FRAGFAILS);
1da177e4
LT
795 err = -ENOMEM;
796 goto fail;
797 }
798
799 /*
800 * Set up data on packet
801 */
802
803 ip6_copy_metadata(frag, skb);
804 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
805 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
c1d2bbe1 806 skb_reset_network_header(frag);
badff6d0 807 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
b0e380b1
ACM
808 frag->transport_header = (frag->network_header + hlen +
809 sizeof(struct frag_hdr));
1da177e4
LT
810
811 /*
812 * Charge the memory for the fragment to any owner
813 * it might possess
814 */
815 if (skb->sk)
816 skb_set_owner_w(frag, skb->sk);
817
818 /*
819 * Copy the packet header into the new buffer.
820 */
d626f62b 821 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
1da177e4
LT
822
823 /*
824 * Build fragment header.
825 */
826 fh->nexthdr = nexthdr;
827 fh->reserved = 0;
f36d6ab1 828 if (!frag_id) {
7ea2f2c5 829 ipv6_select_ident(fh);
1da177e4
LT
830 frag_id = fh->identification;
831 } else
832 fh->identification = frag_id;
833
834 /*
835 * Copy a block of the IP datagram.
836 */
8984e41d 837 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
1da177e4
LT
838 BUG();
839 left -= len;
840
841 fh->frag_off = htons(offset);
842 if (left > 0)
843 fh->frag_off |= htons(IP6_MF);
0660e03f
ACM
844 ipv6_hdr(frag)->payload_len = htons(frag->len -
845 sizeof(struct ipv6hdr));
1da177e4
LT
846
847 ptr += len;
848 offset += len;
849
850 /*
851 * Put this fragment into the sending queue.
852 */
1da177e4
LT
853 err = output(frag);
854 if (err)
855 goto fail;
dafee490 856
adf30907 857 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
3bd653c8 858 IPSTATS_MIB_FRAGCREATES);
1da177e4 859 }
adf30907 860 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
a11d206d 861 IPSTATS_MIB_FRAGOKS);
1da177e4 862 kfree_skb(skb);
1da177e4
LT
863 return err;
864
865fail:
adf30907 866 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
a11d206d 867 IPSTATS_MIB_FRAGFAILS);
1ab1457c 868 kfree_skb(skb);
1da177e4
LT
869 return err;
870}
871
cf6b1982
YH
872static inline int ip6_rt_check(struct rt6key *rt_key,
873 struct in6_addr *fl_addr,
874 struct in6_addr *addr_cache)
875{
876 return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
877 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
878}
879
497c615a
HX
880static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
881 struct dst_entry *dst,
882 struct flowi *fl)
1da177e4 883{
497c615a
HX
884 struct ipv6_pinfo *np = inet6_sk(sk);
885 struct rt6_info *rt = (struct rt6_info *)dst;
1da177e4 886
497c615a
HX
887 if (!dst)
888 goto out;
889
890 /* Yes, checking route validity in not connected
891 * case is not very simple. Take into account,
892 * that we do not support routing by source, TOS,
893 * and MSG_DONTROUTE --ANK (980726)
894 *
cf6b1982
YH
895 * 1. ip6_rt_check(): If route was host route,
896 * check that cached destination is current.
497c615a
HX
897 * If it is network route, we still may
898 * check its validity using saved pointer
899 * to the last used address: daddr_cache.
900 * We do not want to save whole address now,
901 * (because main consumer of this service
902 * is tcp, which has not this problem),
903 * so that the last trick works only on connected
904 * sockets.
905 * 2. oif also should be the same.
906 */
cf6b1982 907 if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
8e1ef0a9
YH
908#ifdef CONFIG_IPV6_SUBTREES
909 ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
910#endif
cf6b1982 911 (fl->oif && fl->oif != dst->dev->ifindex)) {
497c615a
HX
912 dst_release(dst);
913 dst = NULL;
1da177e4
LT
914 }
915
497c615a
HX
916out:
917 return dst;
918}
919
920static int ip6_dst_lookup_tail(struct sock *sk,
921 struct dst_entry **dst, struct flowi *fl)
922{
923 int err;
3b1e0a65 924 struct net *net = sock_net(sk);
497c615a 925
1da177e4 926 if (*dst == NULL)
8a3edd80 927 *dst = ip6_route_output(net, sk, fl);
1da177e4
LT
928
929 if ((err = (*dst)->error))
930 goto out_err_release;
931
932 if (ipv6_addr_any(&fl->fl6_src)) {
191cd582 933 err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
7cbca67c
YH
934 &fl->fl6_dst,
935 sk ? inet6_sk(sk)->srcprefs : 0,
936 &fl->fl6_src);
44456d37 937 if (err)
1da177e4 938 goto out_err_release;
1da177e4
LT
939 }
940
95c385b4 941#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
e550dfb0
NH
942 /*
943 * Here if the dst entry we've looked up
944 * has a neighbour entry that is in the INCOMPLETE
945 * state and the src address from the flow is
946 * marked as OPTIMISTIC, we release the found
947 * dst entry and replace it instead with the
948 * dst entry of the nexthop router
949 */
950 if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
951 struct inet6_ifaddr *ifp;
952 struct flowi fl_gw;
953 int redirect;
954
955 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
956 (*dst)->dev, 1);
957
958 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
959 if (ifp)
960 in6_ifa_put(ifp);
961
962 if (redirect) {
963 /*
964 * We need to get the dst entry for the
965 * default router instead
966 */
967 dst_release(*dst);
968 memcpy(&fl_gw, fl, sizeof(struct flowi));
969 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
970 *dst = ip6_route_output(net, sk, &fl_gw);
971 if ((err = (*dst)->error))
972 goto out_err_release;
95c385b4 973 }
e550dfb0 974 }
95c385b4
NH
975#endif
976
1da177e4
LT
977 return 0;
978
979out_err_release:
ca46f9c8 980 if (err == -ENETUNREACH)
483a47d2 981 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1da177e4
LT
982 dst_release(*dst);
983 *dst = NULL;
984 return err;
985}
34a0b3cd 986
497c615a
HX
987/**
988 * ip6_dst_lookup - perform route lookup on flow
989 * @sk: socket which provides route info
990 * @dst: pointer to dst_entry * for result
991 * @fl: flow to lookup
992 *
993 * This function performs a route lookup on the given flow.
994 *
995 * It returns zero on success, or a standard errno code on error.
996 */
997int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
998{
999 *dst = NULL;
1000 return ip6_dst_lookup_tail(sk, dst, fl);
1001}
3cf3dc6c
ACM
1002EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1003
497c615a
HX
1004/**
1005 * ip6_sk_dst_lookup - perform socket cached route lookup on flow
1006 * @sk: socket which provides the dst cache and route info
1007 * @dst: pointer to dst_entry * for result
1008 * @fl: flow to lookup
1009 *
1010 * This function performs a route lookup on the given flow with the
1011 * possibility of using the cached route in the socket if it is valid.
1012 * It will take the socket dst lock when operating on the dst cache.
1013 * As a result, this function can only be used in process context.
1014 *
1015 * It returns zero on success, or a standard errno code on error.
1016 */
1017int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1018{
1019 *dst = NULL;
1020 if (sk) {
1021 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1022 *dst = ip6_sk_dst_check(sk, *dst, fl);
1023 }
1024
1025 return ip6_dst_lookup_tail(sk, dst, fl);
1026}
1027EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1028
34a0b3cd 1029static inline int ip6_ufo_append_data(struct sock *sk,
e89e9cf5
AR
1030 int getfrag(void *from, char *to, int offset, int len,
1031 int odd, struct sk_buff *skb),
1032 void *from, int length, int hh_len, int fragheaderlen,
1033 int transhdrlen, int mtu,unsigned int flags)
1034
1035{
1036 struct sk_buff *skb;
1037 int err;
1038
1039 /* There is support for UDP large send offload by network
1040 * device, so create one single skb packet containing complete
1041 * udp datagram
1042 */
1043 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1044 skb = sock_alloc_send_skb(sk,
1045 hh_len + fragheaderlen + transhdrlen + 20,
1046 (flags & MSG_DONTWAIT), &err);
1047 if (skb == NULL)
1048 return -ENOMEM;
1049
1050 /* reserve space for Hardware header */
1051 skb_reserve(skb, hh_len);
1052
1053 /* create space for UDP/IP header */
1054 skb_put(skb,fragheaderlen + transhdrlen);
1055
1056 /* initialize network header pointer */
c1d2bbe1 1057 skb_reset_network_header(skb);
e89e9cf5
AR
1058
1059 /* initialize protocol header pointer */
b0e380b1 1060 skb->transport_header = skb->network_header + fragheaderlen;
e89e9cf5 1061
84fa7933 1062 skb->ip_summed = CHECKSUM_PARTIAL;
e89e9cf5
AR
1063 skb->csum = 0;
1064 sk->sk_sndmsg_off = 0;
1065 }
1066
1067 err = skb_append_datato_frags(sk,skb, getfrag, from,
1068 (length - transhdrlen));
1069 if (!err) {
1070 struct frag_hdr fhdr;
1071
c31d5326
SS
1072 /* Specify the length of each IPv6 datagram fragment.
1073 * It has to be a multiple of 8.
1074 */
1075 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1076 sizeof(struct frag_hdr)) & ~7;
f83ef8c0 1077 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
7ea2f2c5 1078 ipv6_select_ident(&fhdr);
e89e9cf5
AR
1079 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1080 __skb_queue_tail(&sk->sk_write_queue, skb);
1081
1082 return 0;
1083 }
1084 /* There is not enough support do UPD LSO,
1085 * so follow normal path
1086 */
1087 kfree_skb(skb);
1088
1089 return err;
1090}
1da177e4 1091
0178b695
HX
1092static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1093 gfp_t gfp)
1094{
1095 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1096}
1097
1098static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1099 gfp_t gfp)
1100{
1101 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1102}
1103
41a1f8ea
YH
1104int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1105 int offset, int len, int odd, struct sk_buff *skb),
1106 void *from, int length, int transhdrlen,
1107 int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
1108 struct rt6_info *rt, unsigned int flags)
1da177e4
LT
1109{
1110 struct inet_sock *inet = inet_sk(sk);
1111 struct ipv6_pinfo *np = inet6_sk(sk);
1112 struct sk_buff *skb;
1113 unsigned int maxfraglen, fragheaderlen;
1114 int exthdrlen;
1115 int hh_len;
1116 int mtu;
1117 int copy;
1118 int err;
1119 int offset = 0;
1120 int csummode = CHECKSUM_NONE;
1121
1122 if (flags&MSG_PROBE)
1123 return 0;
1124 if (skb_queue_empty(&sk->sk_write_queue)) {
1125 /*
1126 * setup for corking
1127 */
1128 if (opt) {
0178b695 1129 if (WARN_ON(np->cork.opt))
1da177e4 1130 return -EINVAL;
0178b695
HX
1131
1132 np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1133 if (unlikely(np->cork.opt == NULL))
1134 return -ENOBUFS;
1135
1136 np->cork.opt->tot_len = opt->tot_len;
1137 np->cork.opt->opt_flen = opt->opt_flen;
1138 np->cork.opt->opt_nflen = opt->opt_nflen;
1139
1140 np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1141 sk->sk_allocation);
1142 if (opt->dst0opt && !np->cork.opt->dst0opt)
1143 return -ENOBUFS;
1144
1145 np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1146 sk->sk_allocation);
1147 if (opt->dst1opt && !np->cork.opt->dst1opt)
1148 return -ENOBUFS;
1149
1150 np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1151 sk->sk_allocation);
1152 if (opt->hopopt && !np->cork.opt->hopopt)
1153 return -ENOBUFS;
1154
1155 np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1156 sk->sk_allocation);
1157 if (opt->srcrt && !np->cork.opt->srcrt)
1158 return -ENOBUFS;
1159
1da177e4
LT
1160 /* need source address above miyazawa*/
1161 }
1162 dst_hold(&rt->u.dst);
c8cdaf99 1163 inet->cork.dst = &rt->u.dst;
1da177e4
LT
1164 inet->cork.fl = *fl;
1165 np->cork.hop_limit = hlimit;
41a1f8ea 1166 np->cork.tclass = tclass;
628a5c56
JH
1167 mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1168 rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
c7503609 1169 if (np->frag_size < mtu) {
d91675f9
YH
1170 if (np->frag_size)
1171 mtu = np->frag_size;
1172 }
1173 inet->cork.fragsize = mtu;
1da177e4
LT
1174 if (dst_allfrag(rt->u.dst.path))
1175 inet->cork.flags |= IPCORK_ALLFRAG;
1176 inet->cork.length = 0;
1177 sk->sk_sndmsg_page = NULL;
1178 sk->sk_sndmsg_off = 0;
01488942 1179 exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
a1b05140 1180 rt->rt6i_nfheader_len;
1da177e4
LT
1181 length += exthdrlen;
1182 transhdrlen += exthdrlen;
1183 } else {
c8cdaf99 1184 rt = (struct rt6_info *)inet->cork.dst;
1da177e4 1185 fl = &inet->cork.fl;
0178b695 1186 opt = np->cork.opt;
1da177e4
LT
1187 transhdrlen = 0;
1188 exthdrlen = 0;
1189 mtu = inet->cork.fragsize;
1190 }
1191
1192 hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
1193
a1b05140 1194 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
b4ce9277 1195 (opt ? opt->opt_nflen : 0);
1da177e4
LT
1196 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1197
1198 if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1199 if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1200 ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
1201 return -EMSGSIZE;
1202 }
1203 }
1204
1205 /*
1206 * Let's try using as much space as possible.
1207 * Use MTU if total length of the message fits into the MTU.
1208 * Otherwise, we need to reserve fragment header and
1209 * fragment alignment (= 8-15 octects, in total).
1210 *
1211 * Note that we may need to "move" the data from the tail of
1ab1457c 1212 * of the buffer to the new fragment when we split
1da177e4
LT
1213 * the message.
1214 *
1ab1457c 1215 * FIXME: It may be fragmented into multiple chunks
1da177e4
LT
1216 * at once if non-fragmentable extension headers
1217 * are too large.
1ab1457c 1218 * --yoshfuji
1da177e4
LT
1219 */
1220
1221 inet->cork.length += length;
e89e9cf5
AR
1222 if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
1223 (rt->u.dst.dev->features & NETIF_F_UFO)) {
1224
baa829d8
PM
1225 err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
1226 fragheaderlen, transhdrlen, mtu,
1227 flags);
1228 if (err)
e89e9cf5 1229 goto error;
e89e9cf5
AR
1230 return 0;
1231 }
1da177e4
LT
1232
1233 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1234 goto alloc_new_skb;
1235
1236 while (length > 0) {
1237 /* Check if the remaining data fits into current packet. */
1238 copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1239 if (copy < length)
1240 copy = maxfraglen - skb->len;
1241
1242 if (copy <= 0) {
1243 char *data;
1244 unsigned int datalen;
1245 unsigned int fraglen;
1246 unsigned int fraggap;
1247 unsigned int alloclen;
1248 struct sk_buff *skb_prev;
1249alloc_new_skb:
1250 skb_prev = skb;
1251
1252 /* There's no room in the current skb */
1253 if (skb_prev)
1254 fraggap = skb_prev->len - maxfraglen;
1255 else
1256 fraggap = 0;
1257
1258 /*
1259 * If remaining data exceeds the mtu,
1260 * we know we need more fragment(s).
1261 */
1262 datalen = length + fraggap;
1263 if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1264 datalen = maxfraglen - fragheaderlen;
1265
1266 fraglen = datalen + fragheaderlen;
1267 if ((flags & MSG_MORE) &&
1268 !(rt->u.dst.dev->features&NETIF_F_SG))
1269 alloclen = mtu;
1270 else
1271 alloclen = datalen + fragheaderlen;
1272
1273 /*
1274 * The last fragment gets additional space at tail.
1275 * Note: we overallocate on fragments with MSG_MODE
1276 * because we have no idea if we're the last one.
1277 */
1278 if (datalen == length + fraggap)
1279 alloclen += rt->u.dst.trailer_len;
1280
1281 /*
1282 * We just reserve space for fragment header.
1ab1457c 1283 * Note: this may be overallocation if the message
1da177e4
LT
1284 * (without MSG_MORE) fits into the MTU.
1285 */
1286 alloclen += sizeof(struct frag_hdr);
1287
1288 if (transhdrlen) {
1289 skb = sock_alloc_send_skb(sk,
1290 alloclen + hh_len,
1291 (flags & MSG_DONTWAIT), &err);
1292 } else {
1293 skb = NULL;
1294 if (atomic_read(&sk->sk_wmem_alloc) <=
1295 2 * sk->sk_sndbuf)
1296 skb = sock_wmalloc(sk,
1297 alloclen + hh_len, 1,
1298 sk->sk_allocation);
1299 if (unlikely(skb == NULL))
1300 err = -ENOBUFS;
1301 }
1302 if (skb == NULL)
1303 goto error;
1304 /*
1305 * Fill in the control structures
1306 */
1307 skb->ip_summed = csummode;
1308 skb->csum = 0;
1309 /* reserve for fragmentation */
1310 skb_reserve(skb, hh_len+sizeof(struct frag_hdr));
1311
1312 /*
1313 * Find where to start putting bytes
1314 */
1315 data = skb_put(skb, fraglen);
c14d2450 1316 skb_set_network_header(skb, exthdrlen);
1da177e4 1317 data += fragheaderlen;
b0e380b1
ACM
1318 skb->transport_header = (skb->network_header +
1319 fragheaderlen);
1da177e4
LT
1320 if (fraggap) {
1321 skb->csum = skb_copy_and_csum_bits(
1322 skb_prev, maxfraglen,
1323 data + transhdrlen, fraggap, 0);
1324 skb_prev->csum = csum_sub(skb_prev->csum,
1325 skb->csum);
1326 data += fraggap;
e9fa4f7b 1327 pskb_trim_unique(skb_prev, maxfraglen);
1da177e4
LT
1328 }
1329 copy = datalen - transhdrlen - fraggap;
1330 if (copy < 0) {
1331 err = -EINVAL;
1332 kfree_skb(skb);
1333 goto error;
1334 } else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1335 err = -EFAULT;
1336 kfree_skb(skb);
1337 goto error;
1338 }
1339
1340 offset += copy;
1341 length -= datalen - fraggap;
1342 transhdrlen = 0;
1343 exthdrlen = 0;
1344 csummode = CHECKSUM_NONE;
1345
1346 /*
1347 * Put the packet on the pending queue
1348 */
1349 __skb_queue_tail(&sk->sk_write_queue, skb);
1350 continue;
1351 }
1352
1353 if (copy > length)
1354 copy = length;
1355
1356 if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
1357 unsigned int off;
1358
1359 off = skb->len;
1360 if (getfrag(from, skb_put(skb, copy),
1361 offset, copy, off, skb) < 0) {
1362 __skb_trim(skb, off);
1363 err = -EFAULT;
1364 goto error;
1365 }
1366 } else {
1367 int i = skb_shinfo(skb)->nr_frags;
1368 skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1369 struct page *page = sk->sk_sndmsg_page;
1370 int off = sk->sk_sndmsg_off;
1371 unsigned int left;
1372
1373 if (page && (left = PAGE_SIZE - off) > 0) {
1374 if (copy >= left)
1375 copy = left;
1376 if (page != frag->page) {
1377 if (i == MAX_SKB_FRAGS) {
1378 err = -EMSGSIZE;
1379 goto error;
1380 }
1381 get_page(page);
1382 skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1383 frag = &skb_shinfo(skb)->frags[i];
1384 }
1385 } else if(i < MAX_SKB_FRAGS) {
1386 if (copy > PAGE_SIZE)
1387 copy = PAGE_SIZE;
1388 page = alloc_pages(sk->sk_allocation, 0);
1389 if (page == NULL) {
1390 err = -ENOMEM;
1391 goto error;
1392 }
1393 sk->sk_sndmsg_page = page;
1394 sk->sk_sndmsg_off = 0;
1395
1396 skb_fill_page_desc(skb, i, page, 0, 0);
1397 frag = &skb_shinfo(skb)->frags[i];
1da177e4
LT
1398 } else {
1399 err = -EMSGSIZE;
1400 goto error;
1401 }
1402 if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
1403 err = -EFAULT;
1404 goto error;
1405 }
1406 sk->sk_sndmsg_off += copy;
1407 frag->size += copy;
1408 skb->len += copy;
1409 skb->data_len += copy;
f945fa7a
HX
1410 skb->truesize += copy;
1411 atomic_add(copy, &sk->sk_wmem_alloc);
1da177e4
LT
1412 }
1413 offset += copy;
1414 length -= copy;
1415 }
1416 return 0;
1417error:
1418 inet->cork.length -= length;
3bd653c8 1419 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
1420 return err;
1421}
1422
bf138862
PE
1423static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1424{
0178b695
HX
1425 if (np->cork.opt) {
1426 kfree(np->cork.opt->dst0opt);
1427 kfree(np->cork.opt->dst1opt);
1428 kfree(np->cork.opt->hopopt);
1429 kfree(np->cork.opt->srcrt);
1430 kfree(np->cork.opt);
1431 np->cork.opt = NULL;
1432 }
1433
c8cdaf99
YH
1434 if (inet->cork.dst) {
1435 dst_release(inet->cork.dst);
1436 inet->cork.dst = NULL;
bf138862
PE
1437 inet->cork.flags &= ~IPCORK_ALLFRAG;
1438 }
1439 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1440}
1441
1da177e4
LT
1442int ip6_push_pending_frames(struct sock *sk)
1443{
1444 struct sk_buff *skb, *tmp_skb;
1445 struct sk_buff **tail_skb;
1446 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1447 struct inet_sock *inet = inet_sk(sk);
1448 struct ipv6_pinfo *np = inet6_sk(sk);
3bd653c8 1449 struct net *net = sock_net(sk);
1da177e4
LT
1450 struct ipv6hdr *hdr;
1451 struct ipv6_txoptions *opt = np->cork.opt;
c8cdaf99 1452 struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
1da177e4
LT
1453 struct flowi *fl = &inet->cork.fl;
1454 unsigned char proto = fl->proto;
1455 int err = 0;
1456
1457 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1458 goto out;
1459 tail_skb = &(skb_shinfo(skb)->frag_list);
1460
1461 /* move skb->data to ip header from ext header */
d56f90a7 1462 if (skb->data < skb_network_header(skb))
bbe735e4 1463 __skb_pull(skb, skb_network_offset(skb));
1da177e4 1464 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
cfe1fc77 1465 __skb_pull(tmp_skb, skb_network_header_len(skb));
1da177e4
LT
1466 *tail_skb = tmp_skb;
1467 tail_skb = &(tmp_skb->next);
1468 skb->len += tmp_skb->len;
1469 skb->data_len += tmp_skb->len;
1da177e4 1470 skb->truesize += tmp_skb->truesize;
1da177e4
LT
1471 tmp_skb->destructor = NULL;
1472 tmp_skb->sk = NULL;
1da177e4
LT
1473 }
1474
28a89453 1475 /* Allow local fragmentation. */
b5c15fc0 1476 if (np->pmtudisc < IPV6_PMTUDISC_DO)
28a89453
HX
1477 skb->local_df = 1;
1478
1da177e4 1479 ipv6_addr_copy(final_dst, &fl->fl6_dst);
cfe1fc77 1480 __skb_pull(skb, skb_network_header_len(skb));
1da177e4
LT
1481 if (opt && opt->opt_flen)
1482 ipv6_push_frag_opts(skb, opt, &proto);
1483 if (opt && opt->opt_nflen)
1484 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1485
e2d1bca7
ACM
1486 skb_push(skb, sizeof(struct ipv6hdr));
1487 skb_reset_network_header(skb);
0660e03f 1488 hdr = ipv6_hdr(skb);
1ab1457c 1489
90bcaf7b 1490 *(__be32*)hdr = fl->fl6_flowlabel |
41a1f8ea 1491 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1da177e4 1492
1da177e4
LT
1493 hdr->hop_limit = np->cork.hop_limit;
1494 hdr->nexthdr = proto;
1495 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1496 ipv6_addr_copy(&hdr->daddr, final_dst);
1497
a2c2064f 1498 skb->priority = sk->sk_priority;
4a19ec58 1499 skb->mark = sk->sk_mark;
a2c2064f 1500
adf30907 1501 skb_dst_set(skb, dst_clone(&rt->u.dst));
edf391ff 1502 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
14878f75 1503 if (proto == IPPROTO_ICMPV6) {
adf30907 1504 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
14878f75 1505
5a57d4c7 1506 ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
e41b5368 1507 ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
14878f75
DS
1508 }
1509
ef76bc23 1510 err = ip6_local_out(skb);
1da177e4
LT
1511 if (err) {
1512 if (err > 0)
6ce9e7b5 1513 err = net_xmit_errno(err);
1da177e4
LT
1514 if (err)
1515 goto error;
1516 }
1517
1518out:
bf138862 1519 ip6_cork_release(inet, np);
1da177e4
LT
1520 return err;
1521error:
06254914 1522 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
1523 goto out;
1524}
1525
1526void ip6_flush_pending_frames(struct sock *sk)
1527{
1da177e4
LT
1528 struct sk_buff *skb;
1529
1530 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
adf30907
ED
1531 if (skb_dst(skb))
1532 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
e1f52208 1533 IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
1534 kfree_skb(skb);
1535 }
1536
bf138862 1537 ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1da177e4 1538}