/* net/ipv6/ip6_output.c — IPv6 output functions (recovered from a git-blame dump) */
1da177e4
LT
1/*
2 * IPv6 output functions
1ab1457c 3 * Linux INET6 implementation
1da177e4
LT
4 *
5 * Authors:
1ab1457c 6 * Pedro Roque <roque@di.fc.ul.pt>
1da177e4 7 *
1da177e4
LT
8 * Based on linux/net/ipv4/ip_output.c
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * Changes:
16 * A.N.Kuznetsov : airthmetics in fragmentation.
17 * extension headers are implemented.
18 * route changes now work.
19 * ip6_forward does not confuse sniffers.
20 * etc.
21 *
22 * H. von Brand : Added missing #include <linux/string.h>
23 * Imran Patel : frag id should be in NBO
24 * Kazunori MIYAZAWA @USAGI
25 * : add ip6_append_data and related functions
26 * for datagram xmit
27 */
28
1da177e4 29#include <linux/errno.h>
ef76bc23 30#include <linux/kernel.h>
1da177e4
LT
31#include <linux/string.h>
32#include <linux/socket.h>
33#include <linux/net.h>
34#include <linux/netdevice.h>
35#include <linux/if_arp.h>
36#include <linux/in6.h>
37#include <linux/tcp.h>
38#include <linux/route.h>
b59f45d0 39#include <linux/module.h>
1da177e4
LT
40
41#include <linux/netfilter.h>
42#include <linux/netfilter_ipv6.h>
43
44#include <net/sock.h>
45#include <net/snmp.h>
46
47#include <net/ipv6.h>
48#include <net/ndisc.h>
49#include <net/protocol.h>
50#include <net/ip6_route.h>
51#include <net/addrconf.h>
52#include <net/rawv6.h>
53#include <net/icmp.h>
54#include <net/xfrm.h>
55#include <net/checksum.h>
7bc570c8 56#include <linux/mroute6.h>
1da177e4
LT
57
58static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
59
ef76bc23
HX
60int __ip6_local_out(struct sk_buff *skb)
61{
62 int len;
63
64 len = skb->len - sizeof(struct ipv6hdr);
65 if (len > IPV6_MAXPLEN)
66 len = 0;
67 ipv6_hdr(skb)->payload_len = htons(len);
68
b2e0b385
JE
69 return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
70 skb_dst(skb)->dev, dst_output);
ef76bc23
HX
71}
72
/*
 * Send a locally generated IPv6 packet: run the LOCAL_OUT hook and, if
 * the hook passed the packet through (returned 1), hand it to dst_output.
 */
int ip6_local_out(struct sk_buff *skb)
{
	int verdict = __ip6_local_out(skb);

	if (likely(verdict == 1))
		verdict = dst_output(skb);

	return verdict;
}
EXPORT_SYMBOL_GPL(ip6_local_out);
84
1da177e4
LT
85/* dev_loopback_xmit for use with netfilter. */
86static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
87{
459a98ed 88 skb_reset_mac_header(newskb);
bbe735e4 89 __skb_pull(newskb, skb_network_offset(newskb));
1da177e4
LT
90 newskb->pkt_type = PACKET_LOOPBACK;
91 newskb->ip_summed = CHECKSUM_UNNECESSARY;
adf30907 92 WARN_ON(!skb_dst(newskb));
1da177e4
LT
93
94 netif_rx(newskb);
95 return 0;
96}
97
9e508490 98static int ip6_finish_output2(struct sk_buff *skb)
1da177e4 99{
adf30907 100 struct dst_entry *dst = skb_dst(skb);
1da177e4
LT
101 struct net_device *dev = dst->dev;
102
103 skb->protocol = htons(ETH_P_IPV6);
104 skb->dev = dev;
105
0660e03f 106 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
adf30907 107 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1da177e4 108
7ad6848c 109 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
bd91b8bf
BT
110 ((mroute6_socket(dev_net(dev)) &&
111 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
7bc570c8
YH
112 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
113 &ipv6_hdr(skb)->saddr))) {
1da177e4
LT
114 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
115
116 /* Do not check for IFF_ALLMULTI; multicast routing
117 is not supported in any case.
118 */
119 if (newskb)
b2e0b385
JE
120 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
121 newskb, NULL, newskb->dev,
1da177e4
LT
122 ip6_dev_loopback_xmit);
123
0660e03f 124 if (ipv6_hdr(skb)->hop_limit == 0) {
3bd653c8
DL
125 IP6_INC_STATS(dev_net(dev), idev,
126 IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
127 kfree_skb(skb);
128 return 0;
129 }
130 }
131
edf391ff
NH
132 IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
133 skb->len);
1da177e4
LT
134 }
135
9e508490
JE
136 if (dst->hh)
137 return neigh_hh_output(dst->hh, skb);
138 else if (dst->neighbour)
139 return dst->neighbour->output(skb);
140
141 IP6_INC_STATS_BH(dev_net(dst->dev),
142 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
143 kfree_skb(skb);
144 return -EINVAL;
1da177e4
LT
145}
146
628a5c56
JH
147static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
148{
149 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
150
151 return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
adf30907 152 skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
628a5c56
JH
153}
154
9e508490
JE
155static int ip6_finish_output(struct sk_buff *skb)
156{
157 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
158 dst_allfrag(skb_dst(skb)))
159 return ip6_fragment(skb, ip6_finish_output2);
160 else
161 return ip6_finish_output2(skb);
162}
163
1da177e4
LT
164int ip6_output(struct sk_buff *skb)
165{
9e508490 166 struct net_device *dev = skb_dst(skb)->dev;
adf30907 167 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
778d80be 168 if (unlikely(idev->cnf.disable_ipv6)) {
9e508490 169 IP6_INC_STATS(dev_net(dev), idev,
3bd653c8 170 IPSTATS_MIB_OUTDISCARDS);
778d80be
YH
171 kfree_skb(skb);
172 return 0;
173 }
174
9c6eb28a
JE
175 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
176 ip6_finish_output,
177 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
1da177e4 178}
e281b198 179EXPORT_SYMBOL_GPL(ip6_output);
1da177e4 180
1da177e4
LT
181/*
182 * xmit an sk_buff (used by TCP)
183 */
184
185int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
186 struct ipv6_txoptions *opt, int ipfragok)
187{
3bd653c8 188 struct net *net = sock_net(sk);
b30bd282 189 struct ipv6_pinfo *np = inet6_sk(sk);
1da177e4 190 struct in6_addr *first_hop = &fl->fl6_dst;
adf30907 191 struct dst_entry *dst = skb_dst(skb);
1da177e4
LT
192 struct ipv6hdr *hdr;
193 u8 proto = fl->proto;
194 int seg_len = skb->len;
e651f03a
GR
195 int hlimit = -1;
196 int tclass = 0;
1da177e4
LT
197 u32 mtu;
198
199 if (opt) {
c2636b4d 200 unsigned int head_room;
1da177e4
LT
201
202 /* First: exthdrs may take lots of space (~8K for now)
203 MAX_HEADER is not enough.
204 */
205 head_room = opt->opt_nflen + opt->opt_flen;
206 seg_len += head_room;
207 head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
208
209 if (skb_headroom(skb) < head_room) {
210 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
a11d206d 211 if (skb2 == NULL) {
adf30907 212 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
a11d206d
YH
213 IPSTATS_MIB_OUTDISCARDS);
214 kfree_skb(skb);
1da177e4
LT
215 return -ENOBUFS;
216 }
a11d206d
YH
217 kfree_skb(skb);
218 skb = skb2;
1da177e4
LT
219 if (sk)
220 skb_set_owner_w(skb, sk);
221 }
222 if (opt->opt_flen)
223 ipv6_push_frag_opts(skb, opt, &proto);
224 if (opt->opt_nflen)
225 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
226 }
227
e2d1bca7
ACM
228 skb_push(skb, sizeof(struct ipv6hdr));
229 skb_reset_network_header(skb);
0660e03f 230 hdr = ipv6_hdr(skb);
1da177e4 231
77e2f14f
WY
232 /* Allow local fragmentation. */
233 if (ipfragok)
234 skb->local_df = 1;
235
1da177e4
LT
236 /*
237 * Fill in the IPv6 header
238 */
e651f03a
GR
239 if (np) {
240 tclass = np->tclass;
1da177e4 241 hlimit = np->hop_limit;
e651f03a 242 }
1da177e4 243 if (hlimit < 0)
6b75d090 244 hlimit = ip6_dst_hoplimit(dst);
1da177e4 245
90bcaf7b 246 *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;
41a1f8ea 247
1da177e4
LT
248 hdr->payload_len = htons(seg_len);
249 hdr->nexthdr = proto;
250 hdr->hop_limit = hlimit;
251
252 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
253 ipv6_addr_copy(&hdr->daddr, first_hop);
254
a2c2064f 255 skb->priority = sk->sk_priority;
4a19ec58 256 skb->mark = sk->sk_mark;
a2c2064f 257
1da177e4 258 mtu = dst_mtu(dst);
283d07ac 259 if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
adf30907 260 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
edf391ff 261 IPSTATS_MIB_OUT, skb->len);
b2e0b385
JE
262 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
263 dst->dev, dst_output);
1da177e4
LT
264 }
265
266 if (net_ratelimit())
267 printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
268 skb->dev = dst->dev;
3ffe533c 269 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
adf30907 270 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
1da177e4
LT
271 kfree_skb(skb);
272 return -EMSGSIZE;
273}
274
7159039a
YH
275EXPORT_SYMBOL(ip6_xmit);
276
1da177e4
LT
277/*
278 * To avoid extra problems ND packets are send through this
279 * routine. It's code duplication but I really want to avoid
280 * extra checks since ipv6_build_header is used by TCP (which
281 * is for us performance critical)
282 */
283
284int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
9acd9f3a 285 const struct in6_addr *saddr, const struct in6_addr *daddr,
1da177e4
LT
286 int proto, int len)
287{
288 struct ipv6_pinfo *np = inet6_sk(sk);
289 struct ipv6hdr *hdr;
290 int totlen;
291
292 skb->protocol = htons(ETH_P_IPV6);
293 skb->dev = dev;
294
295 totlen = len + sizeof(struct ipv6hdr);
296
55f79cc0
ACM
297 skb_reset_network_header(skb);
298 skb_put(skb, sizeof(struct ipv6hdr));
0660e03f 299 hdr = ipv6_hdr(skb);
1da177e4 300
ae08e1f0 301 *(__be32*)hdr = htonl(0x60000000);
1da177e4
LT
302
303 hdr->payload_len = htons(len);
304 hdr->nexthdr = proto;
305 hdr->hop_limit = np->hop_limit;
306
307 ipv6_addr_copy(&hdr->saddr, saddr);
308 ipv6_addr_copy(&hdr->daddr, daddr);
309
310 return 0;
311}
312
313static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
314{
315 struct ip6_ra_chain *ra;
316 struct sock *last = NULL;
317
318 read_lock(&ip6_ra_lock);
319 for (ra = ip6_ra_chain; ra; ra = ra->next) {
320 struct sock *sk = ra->sk;
0bd1b59b
AM
321 if (sk && ra->sel == sel &&
322 (!sk->sk_bound_dev_if ||
323 sk->sk_bound_dev_if == skb->dev->ifindex)) {
1da177e4
LT
324 if (last) {
325 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
326 if (skb2)
327 rawv6_rcv(last, skb2);
328 }
329 last = sk;
330 }
331 }
332
333 if (last) {
334 rawv6_rcv(last, skb);
335 read_unlock(&ip6_ra_lock);
336 return 1;
337 }
338 read_unlock(&ip6_ra_lock);
339 return 0;
340}
341
e21e0b5f
VN
342static int ip6_forward_proxy_check(struct sk_buff *skb)
343{
0660e03f 344 struct ipv6hdr *hdr = ipv6_hdr(skb);
e21e0b5f
VN
345 u8 nexthdr = hdr->nexthdr;
346 int offset;
347
348 if (ipv6_ext_hdr(nexthdr)) {
349 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
350 if (offset < 0)
351 return 0;
352 } else
353 offset = sizeof(struct ipv6hdr);
354
355 if (nexthdr == IPPROTO_ICMPV6) {
356 struct icmp6hdr *icmp6;
357
d56f90a7
ACM
358 if (!pskb_may_pull(skb, (skb_network_header(skb) +
359 offset + 1 - skb->data)))
e21e0b5f
VN
360 return 0;
361
d56f90a7 362 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
e21e0b5f
VN
363
364 switch (icmp6->icmp6_type) {
365 case NDISC_ROUTER_SOLICITATION:
366 case NDISC_ROUTER_ADVERTISEMENT:
367 case NDISC_NEIGHBOUR_SOLICITATION:
368 case NDISC_NEIGHBOUR_ADVERTISEMENT:
369 case NDISC_REDIRECT:
370 /* For reaction involving unicast neighbor discovery
371 * message destined to the proxied address, pass it to
372 * input function.
373 */
374 return 1;
375 default:
376 break;
377 }
378 }
379
74553b09
VN
380 /*
381 * The proxying router can't forward traffic sent to a link-local
382 * address, so signal the sender and discard the packet. This
383 * behavior is clarified by the MIPv6 specification.
384 */
385 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
386 dst_link_failure(skb);
387 return -1;
388 }
389
e21e0b5f
VN
390 return 0;
391}
392
1da177e4
LT
/* Netfilter FORWARD-hook continuation: just hand off to dst_output. */
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}
397
398int ip6_forward(struct sk_buff *skb)
399{
adf30907 400 struct dst_entry *dst = skb_dst(skb);
0660e03f 401 struct ipv6hdr *hdr = ipv6_hdr(skb);
1da177e4 402 struct inet6_skb_parm *opt = IP6CB(skb);
c346dca1 403 struct net *net = dev_net(dst->dev);
14f3ad6f 404 u32 mtu;
1ab1457c 405
53b7997f 406 if (net->ipv6.devconf_all->forwarding == 0)
1da177e4
LT
407 goto error;
408
4497b076
BH
409 if (skb_warn_if_lro(skb))
410 goto drop;
411
1da177e4 412 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
3bd653c8 413 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
1da177e4
LT
414 goto drop;
415 }
416
35fc92a9 417 skb_forward_csum(skb);
1da177e4
LT
418
419 /*
420 * We DO NOT make any processing on
421 * RA packets, pushing them to user level AS IS
422 * without ane WARRANTY that application will be able
423 * to interpret them. The reason is that we
424 * cannot make anything clever here.
425 *
426 * We are not end-node, so that if packet contains
427 * AH/ESP, we cannot make anything.
428 * Defragmentation also would be mistake, RA packets
429 * cannot be fragmented, because there is no warranty
430 * that different fragments will go along one path. --ANK
431 */
432 if (opt->ra) {
d56f90a7 433 u8 *ptr = skb_network_header(skb) + opt->ra;
1da177e4
LT
434 if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
435 return 0;
436 }
437
438 /*
439 * check and decrement ttl
440 */
441 if (hdr->hop_limit <= 1) {
442 /* Force OUTPUT device used as source address */
443 skb->dev = dst->dev;
3ffe533c 444 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
483a47d2
DL
445 IP6_INC_STATS_BH(net,
446 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
1da177e4
LT
447
448 kfree_skb(skb);
449 return -ETIMEDOUT;
450 }
451
fbea49e1 452 /* XXX: idev->cnf.proxy_ndp? */
53b7997f 453 if (net->ipv6.devconf_all->proxy_ndp &&
8a3edd80 454 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
74553b09
VN
455 int proxied = ip6_forward_proxy_check(skb);
456 if (proxied > 0)
e21e0b5f 457 return ip6_input(skb);
74553b09 458 else if (proxied < 0) {
3bd653c8
DL
459 IP6_INC_STATS(net, ip6_dst_idev(dst),
460 IPSTATS_MIB_INDISCARDS);
74553b09
VN
461 goto drop;
462 }
e21e0b5f
VN
463 }
464
1da177e4 465 if (!xfrm6_route_forward(skb)) {
3bd653c8 466 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
1da177e4
LT
467 goto drop;
468 }
adf30907 469 dst = skb_dst(skb);
1da177e4
LT
470
471 /* IPv6 specs say nothing about it, but it is clear that we cannot
472 send redirects to source routed frames.
1e5dc146 473 We don't send redirects to frames decapsulated from IPsec.
1da177e4 474 */
1e5dc146 475 if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
def8b4fa 476 !skb_sec_path(skb)) {
1da177e4
LT
477 struct in6_addr *target = NULL;
478 struct rt6_info *rt;
479 struct neighbour *n = dst->neighbour;
480
481 /*
482 * incoming and outgoing devices are the same
483 * send a redirect.
484 */
485
486 rt = (struct rt6_info *) dst;
487 if ((rt->rt6i_flags & RTF_GATEWAY))
488 target = (struct in6_addr*)&n->primary_key;
489 else
490 target = &hdr->daddr;
491
492 /* Limit redirects both by destination (here)
493 and by source (inside ndisc_send_redirect)
494 */
495 if (xrlim_allow(dst, 1*HZ))
496 ndisc_send_redirect(skb, n, target);
5bb1ab09
DS
497 } else {
498 int addrtype = ipv6_addr_type(&hdr->saddr);
499
1da177e4 500 /* This check is security critical. */
f81b2e7d
YH
501 if (addrtype == IPV6_ADDR_ANY ||
502 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
5bb1ab09
DS
503 goto error;
504 if (addrtype & IPV6_ADDR_LINKLOCAL) {
505 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
3ffe533c 506 ICMPV6_NOT_NEIGHBOUR, 0);
5bb1ab09
DS
507 goto error;
508 }
1da177e4
LT
509 }
510
14f3ad6f
UW
511 mtu = dst_mtu(dst);
512 if (mtu < IPV6_MIN_MTU)
513 mtu = IPV6_MIN_MTU;
514
515 if (skb->len > mtu) {
1da177e4
LT
516 /* Again, force OUTPUT device used as source address */
517 skb->dev = dst->dev;
14f3ad6f 518 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
483a47d2
DL
519 IP6_INC_STATS_BH(net,
520 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
521 IP6_INC_STATS_BH(net,
522 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
1da177e4
LT
523 kfree_skb(skb);
524 return -EMSGSIZE;
525 }
526
527 if (skb_cow(skb, dst->dev->hard_header_len)) {
3bd653c8 528 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
529 goto drop;
530 }
531
0660e03f 532 hdr = ipv6_hdr(skb);
1da177e4
LT
533
534 /* Mangling hops number delayed to point after skb COW */
1ab1457c 535
1da177e4
LT
536 hdr->hop_limit--;
537
483a47d2 538 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
b2e0b385 539 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
6e23ae2a 540 ip6_forward_finish);
1da177e4
LT
541
542error:
483a47d2 543 IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
1da177e4
LT
544drop:
545 kfree_skb(skb);
546 return -EINVAL;
547}
548
549static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
550{
551 to->pkt_type = from->pkt_type;
552 to->priority = from->priority;
553 to->protocol = from->protocol;
adf30907
ED
554 skb_dst_drop(to);
555 skb_dst_set(to, dst_clone(skb_dst(from)));
1da177e4 556 to->dev = from->dev;
82e91ffe 557 to->mark = from->mark;
1da177e4
LT
558
559#ifdef CONFIG_NET_SCHED
560 to->tc_index = from->tc_index;
561#endif
e7ac05f3 562 nf_copy(to, from);
ba9dda3a
JK
563#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
564 defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
565 to->nf_trace = from->nf_trace;
566#endif
984bc16c 567 skb_copy_secmark(to, from);
1da177e4
LT
568}
569
570int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
571{
572 u16 offset = sizeof(struct ipv6hdr);
0660e03f
ACM
573 struct ipv6_opt_hdr *exthdr =
574 (struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
27a884dc 575 unsigned int packet_len = skb->tail - skb->network_header;
1da177e4 576 int found_rhdr = 0;
0660e03f 577 *nexthdr = &ipv6_hdr(skb)->nexthdr;
1da177e4
LT
578
579 while (offset + 1 <= packet_len) {
580
581 switch (**nexthdr) {
582
583 case NEXTHDR_HOP:
27637df9 584 break;
1da177e4 585 case NEXTHDR_ROUTING:
27637df9
MN
586 found_rhdr = 1;
587 break;
1da177e4 588 case NEXTHDR_DEST:
59fbb3a6 589#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
27637df9
MN
590 if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
591 break;
592#endif
593 if (found_rhdr)
594 return offset;
1da177e4
LT
595 break;
596 default :
597 return offset;
598 }
27637df9
MN
599
600 offset += ipv6_optlen(exthdr);
601 *nexthdr = &exthdr->nexthdr;
d56f90a7
ACM
602 exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
603 offset);
1da177e4
LT
604 }
605
606 return offset;
607}
608
609static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
610{
1da177e4 611 struct sk_buff *frag;
adf30907 612 struct rt6_info *rt = (struct rt6_info*)skb_dst(skb);
d91675f9 613 struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
1da177e4
LT
614 struct ipv6hdr *tmp_hdr;
615 struct frag_hdr *fh;
616 unsigned int mtu, hlen, left, len;
ae08e1f0 617 __be32 frag_id = 0;
1da177e4
LT
618 int ptr, offset = 0, err=0;
619 u8 *prevhdr, nexthdr = 0;
adf30907 620 struct net *net = dev_net(skb_dst(skb)->dev);
1da177e4 621
1da177e4
LT
622 hlen = ip6_find_1stfragopt(skb, &prevhdr);
623 nexthdr = *prevhdr;
624
628a5c56 625 mtu = ip6_skb_dst_mtu(skb);
b881ef76
JH
626
627 /* We must not fragment if the socket is set to force MTU discovery
14f3ad6f 628 * or if the skb it not generated by a local socket.
b881ef76 629 */
b5c15fc0 630 if (!skb->local_df) {
adf30907 631 skb->dev = skb_dst(skb)->dev;
3ffe533c 632 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
adf30907 633 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
3bd653c8 634 IPSTATS_MIB_FRAGFAILS);
b881ef76
JH
635 kfree_skb(skb);
636 return -EMSGSIZE;
637 }
638
d91675f9
YH
639 if (np && np->frag_size < mtu) {
640 if (np->frag_size)
641 mtu = np->frag_size;
642 }
643 mtu -= hlen + sizeof(struct frag_hdr);
1da177e4 644
4d9092bb 645 if (skb_has_frags(skb)) {
1da177e4 646 int first_len = skb_pagelen(skb);
29ffe1a5 647 int truesizes = 0;
1da177e4
LT
648
649 if (first_len - hlen > mtu ||
650 ((first_len - hlen) & 7) ||
651 skb_cloned(skb))
652 goto slow_path;
653
4d9092bb 654 skb_walk_frags(skb, frag) {
1da177e4
LT
655 /* Correct geometry. */
656 if (frag->len > mtu ||
657 ((frag->len & 7) && frag->next) ||
658 skb_headroom(frag) < hlen)
659 goto slow_path;
660
1da177e4
LT
661 /* Partially cloned skb? */
662 if (skb_shared(frag))
663 goto slow_path;
2fdba6b0
HX
664
665 BUG_ON(frag->sk);
666 if (skb->sk) {
2fdba6b0
HX
667 frag->sk = skb->sk;
668 frag->destructor = sock_wfree;
29ffe1a5 669 truesizes += frag->truesize;
2fdba6b0 670 }
1da177e4
LT
671 }
672
673 err = 0;
674 offset = 0;
675 frag = skb_shinfo(skb)->frag_list;
4d9092bb 676 skb_frag_list_init(skb);
1da177e4
LT
677 /* BUILD HEADER */
678
9a217a1c 679 *prevhdr = NEXTHDR_FRAGMENT;
d56f90a7 680 tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
1da177e4 681 if (!tmp_hdr) {
adf30907 682 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
3bd653c8 683 IPSTATS_MIB_FRAGFAILS);
1da177e4
LT
684 return -ENOMEM;
685 }
686
1da177e4
LT
687 __skb_pull(skb, hlen);
688 fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
e2d1bca7
ACM
689 __skb_push(skb, hlen);
690 skb_reset_network_header(skb);
d56f90a7 691 memcpy(skb_network_header(skb), tmp_hdr, hlen);
1da177e4 692
7ea2f2c5 693 ipv6_select_ident(fh);
1da177e4
LT
694 fh->nexthdr = nexthdr;
695 fh->reserved = 0;
696 fh->frag_off = htons(IP6_MF);
697 frag_id = fh->identification;
698
699 first_len = skb_pagelen(skb);
700 skb->data_len = first_len - skb_headlen(skb);
29ffe1a5 701 skb->truesize -= truesizes;
1da177e4 702 skb->len = first_len;
0660e03f
ACM
703 ipv6_hdr(skb)->payload_len = htons(first_len -
704 sizeof(struct ipv6hdr));
a11d206d
YH
705
706 dst_hold(&rt->u.dst);
1da177e4
LT
707
708 for (;;) {
709 /* Prepare header of the next frame,
710 * before previous one went down. */
711 if (frag) {
712 frag->ip_summed = CHECKSUM_NONE;
badff6d0 713 skb_reset_transport_header(frag);
1da177e4 714 fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
e2d1bca7
ACM
715 __skb_push(frag, hlen);
716 skb_reset_network_header(frag);
d56f90a7
ACM
717 memcpy(skb_network_header(frag), tmp_hdr,
718 hlen);
1da177e4
LT
719 offset += skb->len - hlen - sizeof(struct frag_hdr);
720 fh->nexthdr = nexthdr;
721 fh->reserved = 0;
722 fh->frag_off = htons(offset);
723 if (frag->next != NULL)
724 fh->frag_off |= htons(IP6_MF);
725 fh->identification = frag_id;
0660e03f
ACM
726 ipv6_hdr(frag)->payload_len =
727 htons(frag->len -
728 sizeof(struct ipv6hdr));
1da177e4
LT
729 ip6_copy_metadata(frag, skb);
730 }
1ab1457c 731
1da177e4 732 err = output(skb);
dafee490 733 if(!err)
3bd653c8
DL
734 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
735 IPSTATS_MIB_FRAGCREATES);
dafee490 736
1da177e4
LT
737 if (err || !frag)
738 break;
739
740 skb = frag;
741 frag = skb->next;
742 skb->next = NULL;
743 }
744
a51482bd 745 kfree(tmp_hdr);
1da177e4
LT
746
747 if (err == 0) {
3bd653c8
DL
748 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
749 IPSTATS_MIB_FRAGOKS);
a11d206d 750 dst_release(&rt->u.dst);
1da177e4
LT
751 return 0;
752 }
753
754 while (frag) {
755 skb = frag->next;
756 kfree_skb(frag);
757 frag = skb;
758 }
759
3bd653c8
DL
760 IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
761 IPSTATS_MIB_FRAGFAILS);
a11d206d 762 dst_release(&rt->u.dst);
1da177e4
LT
763 return err;
764 }
765
766slow_path:
767 left = skb->len - hlen; /* Space per frame */
768 ptr = hlen; /* Where to start from */
769
770 /*
771 * Fragment the datagram.
772 */
773
774 *prevhdr = NEXTHDR_FRAGMENT;
775
776 /*
777 * Keep copying data until we run out.
778 */
779 while(left > 0) {
780 len = left;
781 /* IF: it doesn't fit, use 'mtu' - the data space left */
782 if (len > mtu)
783 len = mtu;
784 /* IF: we are not sending upto and including the packet end
785 then align the next start on an eight byte boundary */
786 if (len < left) {
787 len &= ~7;
788 }
789 /*
790 * Allocate buffer.
791 */
792
f5184d26 793 if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_ALLOCATED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
64ce2073 794 NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
adf30907 795 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
a11d206d 796 IPSTATS_MIB_FRAGFAILS);
1da177e4
LT
797 err = -ENOMEM;
798 goto fail;
799 }
800
801 /*
802 * Set up data on packet
803 */
804
805 ip6_copy_metadata(frag, skb);
806 skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
807 skb_put(frag, len + hlen + sizeof(struct frag_hdr));
c1d2bbe1 808 skb_reset_network_header(frag);
badff6d0 809 fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
b0e380b1
ACM
810 frag->transport_header = (frag->network_header + hlen +
811 sizeof(struct frag_hdr));
1da177e4
LT
812
813 /*
814 * Charge the memory for the fragment to any owner
815 * it might possess
816 */
817 if (skb->sk)
818 skb_set_owner_w(frag, skb->sk);
819
820 /*
821 * Copy the packet header into the new buffer.
822 */
d626f62b 823 skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
1da177e4
LT
824
825 /*
826 * Build fragment header.
827 */
828 fh->nexthdr = nexthdr;
829 fh->reserved = 0;
f36d6ab1 830 if (!frag_id) {
7ea2f2c5 831 ipv6_select_ident(fh);
1da177e4
LT
832 frag_id = fh->identification;
833 } else
834 fh->identification = frag_id;
835
836 /*
837 * Copy a block of the IP datagram.
838 */
8984e41d 839 if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
1da177e4
LT
840 BUG();
841 left -= len;
842
843 fh->frag_off = htons(offset);
844 if (left > 0)
845 fh->frag_off |= htons(IP6_MF);
0660e03f
ACM
846 ipv6_hdr(frag)->payload_len = htons(frag->len -
847 sizeof(struct ipv6hdr));
1da177e4
LT
848
849 ptr += len;
850 offset += len;
851
852 /*
853 * Put this fragment into the sending queue.
854 */
1da177e4
LT
855 err = output(frag);
856 if (err)
857 goto fail;
dafee490 858
adf30907 859 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
3bd653c8 860 IPSTATS_MIB_FRAGCREATES);
1da177e4 861 }
adf30907 862 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
a11d206d 863 IPSTATS_MIB_FRAGOKS);
1da177e4 864 kfree_skb(skb);
1da177e4
LT
865 return err;
866
867fail:
adf30907 868 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
a11d206d 869 IPSTATS_MIB_FRAGFAILS);
1ab1457c 870 kfree_skb(skb);
1da177e4
LT
871 return err;
872}
873
cf6b1982
YH
874static inline int ip6_rt_check(struct rt6key *rt_key,
875 struct in6_addr *fl_addr,
876 struct in6_addr *addr_cache)
877{
878 return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
879 (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
880}
881
497c615a
HX
882static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
883 struct dst_entry *dst,
884 struct flowi *fl)
1da177e4 885{
497c615a
HX
886 struct ipv6_pinfo *np = inet6_sk(sk);
887 struct rt6_info *rt = (struct rt6_info *)dst;
1da177e4 888
497c615a
HX
889 if (!dst)
890 goto out;
891
892 /* Yes, checking route validity in not connected
893 * case is not very simple. Take into account,
894 * that we do not support routing by source, TOS,
895 * and MSG_DONTROUTE --ANK (980726)
896 *
cf6b1982
YH
897 * 1. ip6_rt_check(): If route was host route,
898 * check that cached destination is current.
497c615a
HX
899 * If it is network route, we still may
900 * check its validity using saved pointer
901 * to the last used address: daddr_cache.
902 * We do not want to save whole address now,
903 * (because main consumer of this service
904 * is tcp, which has not this problem),
905 * so that the last trick works only on connected
906 * sockets.
907 * 2. oif also should be the same.
908 */
cf6b1982 909 if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
8e1ef0a9
YH
910#ifdef CONFIG_IPV6_SUBTREES
911 ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
912#endif
cf6b1982 913 (fl->oif && fl->oif != dst->dev->ifindex)) {
497c615a
HX
914 dst_release(dst);
915 dst = NULL;
1da177e4
LT
916 }
917
497c615a
HX
918out:
919 return dst;
920}
921
922static int ip6_dst_lookup_tail(struct sock *sk,
923 struct dst_entry **dst, struct flowi *fl)
924{
925 int err;
3b1e0a65 926 struct net *net = sock_net(sk);
497c615a 927
1da177e4 928 if (*dst == NULL)
8a3edd80 929 *dst = ip6_route_output(net, sk, fl);
1da177e4
LT
930
931 if ((err = (*dst)->error))
932 goto out_err_release;
933
934 if (ipv6_addr_any(&fl->fl6_src)) {
191cd582 935 err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
7cbca67c
YH
936 &fl->fl6_dst,
937 sk ? inet6_sk(sk)->srcprefs : 0,
938 &fl->fl6_src);
44456d37 939 if (err)
1da177e4 940 goto out_err_release;
1da177e4
LT
941 }
942
95c385b4 943#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
e550dfb0
NH
944 /*
945 * Here if the dst entry we've looked up
946 * has a neighbour entry that is in the INCOMPLETE
947 * state and the src address from the flow is
948 * marked as OPTIMISTIC, we release the found
949 * dst entry and replace it instead with the
950 * dst entry of the nexthop router
951 */
952 if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
953 struct inet6_ifaddr *ifp;
954 struct flowi fl_gw;
955 int redirect;
956
957 ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
958 (*dst)->dev, 1);
959
960 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
961 if (ifp)
962 in6_ifa_put(ifp);
963
964 if (redirect) {
965 /*
966 * We need to get the dst entry for the
967 * default router instead
968 */
969 dst_release(*dst);
970 memcpy(&fl_gw, fl, sizeof(struct flowi));
971 memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
972 *dst = ip6_route_output(net, sk, &fl_gw);
973 if ((err = (*dst)->error))
974 goto out_err_release;
95c385b4 975 }
e550dfb0 976 }
95c385b4
NH
977#endif
978
1da177e4
LT
979 return 0;
980
981out_err_release:
ca46f9c8 982 if (err == -ENETUNREACH)
483a47d2 983 IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1da177e4
LT
984 dst_release(*dst);
985 *dst = NULL;
986 return err;
987}
34a0b3cd 988
497c615a
HX
989/**
990 * ip6_dst_lookup - perform route lookup on flow
991 * @sk: socket which provides route info
992 * @dst: pointer to dst_entry * for result
993 * @fl: flow to lookup
994 *
995 * This function performs a route lookup on the given flow.
996 *
997 * It returns zero on success, or a standard errno code on error.
998 */
999int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1000{
1001 *dst = NULL;
1002 return ip6_dst_lookup_tail(sk, dst, fl);
1003}
3cf3dc6c
ACM
1004EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1005
497c615a
HX
1006/**
1007 * ip6_sk_dst_lookup - perform socket cached route lookup on flow
1008 * @sk: socket which provides the dst cache and route info
1009 * @dst: pointer to dst_entry * for result
1010 * @fl: flow to lookup
1011 *
1012 * This function performs a route lookup on the given flow with the
1013 * possibility of using the cached route in the socket if it is valid.
1014 * It will take the socket dst lock when operating on the dst cache.
1015 * As a result, this function can only be used in process context.
1016 *
1017 * It returns zero on success, or a standard errno code on error.
1018 */
1019int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
1020{
1021 *dst = NULL;
1022 if (sk) {
1023 *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1024 *dst = ip6_sk_dst_check(sk, *dst, fl);
1025 }
1026
1027 return ip6_dst_lookup_tail(sk, dst, fl);
1028}
1029EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);
1030
34a0b3cd 1031static inline int ip6_ufo_append_data(struct sock *sk,
e89e9cf5
AR
1032 int getfrag(void *from, char *to, int offset, int len,
1033 int odd, struct sk_buff *skb),
1034 void *from, int length, int hh_len, int fragheaderlen,
1035 int transhdrlen, int mtu,unsigned int flags)
1036
1037{
1038 struct sk_buff *skb;
1039 int err;
1040
1041 /* There is support for UDP large send offload by network
1042 * device, so create one single skb packet containing complete
1043 * udp datagram
1044 */
1045 if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1046 skb = sock_alloc_send_skb(sk,
1047 hh_len + fragheaderlen + transhdrlen + 20,
1048 (flags & MSG_DONTWAIT), &err);
1049 if (skb == NULL)
1050 return -ENOMEM;
1051
1052 /* reserve space for Hardware header */
1053 skb_reserve(skb, hh_len);
1054
1055 /* create space for UDP/IP header */
1056 skb_put(skb,fragheaderlen + transhdrlen);
1057
1058 /* initialize network header pointer */
c1d2bbe1 1059 skb_reset_network_header(skb);
e89e9cf5
AR
1060
1061 /* initialize protocol header pointer */
b0e380b1 1062 skb->transport_header = skb->network_header + fragheaderlen;
e89e9cf5 1063
84fa7933 1064 skb->ip_summed = CHECKSUM_PARTIAL;
e89e9cf5
AR
1065 skb->csum = 0;
1066 sk->sk_sndmsg_off = 0;
1067 }
1068
1069 err = skb_append_datato_frags(sk,skb, getfrag, from,
1070 (length - transhdrlen));
1071 if (!err) {
1072 struct frag_hdr fhdr;
1073
c31d5326
SS
1074 /* Specify the length of each IPv6 datagram fragment.
1075 * It has to be a multiple of 8.
1076 */
1077 skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1078 sizeof(struct frag_hdr)) & ~7;
f83ef8c0 1079 skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
7ea2f2c5 1080 ipv6_select_ident(&fhdr);
e89e9cf5
AR
1081 skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1082 __skb_queue_tail(&sk->sk_write_queue, skb);
1083
1084 return 0;
1085 }
1086 /* There is not enough support do UPD LSO,
1087 * so follow normal path
1088 */
1089 kfree_skb(skb);
1090
1091 return err;
1092}
1da177e4 1093
0178b695
HX
1094static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1095 gfp_t gfp)
1096{
1097 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1098}
1099
1100static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1101 gfp_t gfp)
1102{
1103 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1104}
1105
41a1f8ea
YH
/*
 * ip6_append_data - append user data to the socket's pending (corked) queue.
 *
 * Builds or extends a chain of skbs on sk->sk_write_queue, sized so that
 * ip6_push_pending_frames() can later emit them as correctly fragmented
 * IPv6 packets.  On the first call of a cork cycle (empty write queue)
 * it duplicates @opt and caches route, flow, hop limit, traffic class
 * and MTU in the socket's cork state; later calls ignore those arguments
 * and reuse the cached state.
 *
 * @getfrag copies user data into the skb; a negative return from it
 * aborts the append.  Returns 0 on success or a negative errno.
 */
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			/* NOTE(review): if one of the dups below fails,
			 * np->cork.opt and any earlier dups remain
			 * allocated; they appear to be reclaimed only by
			 * a later ip6_cork_release() — confirm all
			 * callers eventually reach it on this path.
			 */
			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa*/
		}
		/* Cache the route and flow for the rest of this cork cycle. */
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		/* IPV6_PMTUDISC_PROBE uses the device MTU, bypassing the
		 * (possibly smaller) cached path MTU.
		 */
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			/* A per-socket IPV6_MTU setting further caps the MTU. */
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		/* Extension headers occupy space that is part of this
		 * first chunk but not of later fragments.
		 */
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		/* Subsequent append: reuse the state cached at cork setup;
		 * the rt/fl/opt arguments passed in are ignored.
		 */
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	/* Per-fragment header overhead: IPv6 header, any tunnel/nf
	 * headers, and the non-fragmentable extension headers.
	 */
	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	/* Largest payload boundary per fragment, 8-byte aligned as
	 * required by the IPv6 fragment header.
	 */
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		/* Refuse datagrams whose total corked payload would
		 * exceed the maximum IPv6 payload length.
		 */
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail of
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	/* Large UDP datagrams on UFO-capable devices take the offload
	 * path: one big skb, fragmented by the hardware.
	 */
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				/* Bytes past the fragment boundary must be
				 * moved from the previous skb into the new one.
				 */
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				/* First skb of the datagram: may block per
				 * MSG_DONTWAIT.
				 */
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				/* Follow-on fragment: only allocate while under
				 * twice the send buffer limit.
				 */
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the overhang from the previous skb,
				 * fixing up both checksums, and trim the
				 * previous skb to the fragment boundary.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			/* Header lengths only apply to the first skb. */
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			/* No scatter/gather: copy linearly into the skb. */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* Scatter/gather: append into page fragments,
			 * reusing the socket's current sndmsg page while
			 * it has room.
			 */
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if(i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	/* Undo the optimistic length accounting done above. */
	inet->cork.length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}
1424
bf138862
PE
1425static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1426{
0178b695
HX
1427 if (np->cork.opt) {
1428 kfree(np->cork.opt->dst0opt);
1429 kfree(np->cork.opt->dst1opt);
1430 kfree(np->cork.opt->hopopt);
1431 kfree(np->cork.opt->srcrt);
1432 kfree(np->cork.opt);
1433 np->cork.opt = NULL;
1434 }
1435
c8cdaf99
YH
1436 if (inet->cork.dst) {
1437 dst_release(inet->cork.dst);
1438 inet->cork.dst = NULL;
bf138862
PE
1439 inet->cork.flags &= ~IPCORK_ALLFRAG;
1440 }
1441 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1442}
1443
1da177e4
LT
/*
 * ip6_push_pending_frames - transmit everything queued by ip6_append_data().
 *
 * Collapses the skbs on sk->sk_write_queue into one skb (head plus a
 * frag_list of the rest), pushes the extension headers and the IPv6
 * header using the cached cork state, updates SNMP counters and hands
 * the packet to ip6_local_out().  Always releases the cork state before
 * returning.  Returns 0 on success or a negative errno.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining queued skbs onto the head skb's frag_list,
	 * transferring their length/truesize accounting and detaching
	 * them from the socket.
	 */
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	/* The routing-header push below may rewrite the destination;
	 * start from the flow's final destination.
	 */
	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* First word: version 6, cached traffic class, flow label. */
	*(__be32*)hdr = fl->fl6_flowlabel |
		     htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->u.dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		/* Positive return values are congestion notifications;
		 * map them through net_xmit_errno().
		 */
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
1527
1528void ip6_flush_pending_frames(struct sock *sk)
1529{
1da177e4
LT
1530 struct sk_buff *skb;
1531
1532 while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
adf30907
ED
1533 if (skb_dst(skb))
1534 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
e1f52208 1535 IPSTATS_MIB_OUTDISCARDS);
1da177e4
LT
1536 kfree_skb(skb);
1537 }
1538
bf138862 1539 ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1da177e4 1540}