/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

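/*
 * Select the fragment identification for a locally generated packet.
 * A single global counter serves all flows; it deliberately skips 0,
 * which the slow path of ip6_fragment() treats as "no ID chosen yet".
 */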
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}

int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
		       dst_output);
}

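/*
 * __ip6_local_out() returns 1 when every NF_INET_LOCAL_OUT hook accepted
 * the packet without stealing or queueing it; only then may the caller
 * hand the packet on to dst_output(), which is what the err == 1 check
 * below implements.
 */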
int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

static int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!newskb->dst);

	netif_rx(newskb);
	return 0;
}

static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ((mroute6_socket && !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb->dst->dev->mtu : dst_mtu(skb->dst);
}

int ip6_output(struct sk_buff *skb)
{
	struct inet6_dev *idev = ip6_dst_idev(skb->dst);

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(skb->dst->dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8 proto = fl->proto;
	int seg_len = skb->len;
	int hlimit, tclass;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/* Allow local fragmentation. */
	if (ipfragok)
		skb->local_df = 1;

	/*
	 *	Fill in the IPv6 header
	 */

	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	tclass = -1;
	if (np)
		tclass = np->tclass;
	if (tclass < 0)
		tclass = 0;

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
			      IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(net, ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

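/*
 * Hypothetical caller sketch (illustration only, compiled out): a
 * connected transport that has already built its own header hands a
 * fully framed skb to ip6_xmit(), much as TCPv6 does.  The example_
 * name and the flow setup are assumptions, not kernel API.
 */
#if 0
static int example_transport_xmit(struct sock *sk, struct sk_buff *skb)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct flowi fl;

	memset(&fl, 0, sizeof(fl));
	fl.proto = sk->sk_protocol;
	ipv6_addr_copy(&fl.fl6_dst, &np->daddr);
	ipv6_addr_copy(&fl.fl6_src, &np->saddr);

	/* skb->dst must already hold the route; ipfragok == 0 means an
	 * oversized packet gets ICMPV6_PKT_TOOBIG instead of fragments. */
	return ip6_xmit(sk, skb, &fl, np->opt, 0);
}
#endif
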
/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is for us performance critical)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

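/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * registered for that alert value; each listener but the last gets a
 * clone, the last one consumes the original skb.  Returns 1 if the
 * packet was delivered, in which case ip6_forward() must not forward
 * it any further.
 */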
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We do not do any processing on RA packets: we push them to
	 *	user level AS IS, without any warranty that the application
	 *	will be able to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP we cannot do anything with it.
	 *	Defragmentation would also be a mistake: RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
			goto error;
		}
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

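/*
 * Find the length of the unfragmentable part of the packet, i.e. the
 * offset at which a Fragment header may be inserted: hop-by-hop options,
 * a routing header and any destination options that precede a routing
 * header (or carry a MIPv6 home address option) must be replicated in
 * every fragment.  *nexthdr is left pointing at the nexthdr byte that
 * the caller rewrites to NEXTHDR_FRAGMENT.
 */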
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb->dst;
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb->dst->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.  (This last
	 * check should be redundant, but it's free.)
	 */
	if (!skb->local_df) {
		skb->dev = skb->dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->u.dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)
			len &= ~7;
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      LL_ALLOCATED_SPACE(rt->u.dst.dev),
				      GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

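/*
 * Return nonzero when a cached route can no longer be trusted for this
 * flow: it is neither a host route for exactly the flow's address nor
 * validated by the socket's cached peer address.
 */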
static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;
	struct net *net = sock_net(sk);

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(net, ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst,
					 sk ? inet6_sk(sk)->srcprefs : 0,
					 &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

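/*
 * Hypothetical usage sketch (illustration only, compiled out): resolving
 * a route for an outgoing flow and releasing it afterwards.  The
 * example_ name and the flow fields filled in are assumptions.
 */
#if 0
static int example_resolve_route(struct sock *sk, struct in6_addr *daddr)
{
	struct dst_entry *dst;
	struct flowi fl;
	int err;

	memset(&fl, 0, sizeof(fl));
	fl.proto = IPPROTO_UDP;
	ipv6_addr_copy(&fl.fl6_dst, daddr);

	err = ip6_dst_lookup(sk, &dst, &fl);	/* also fills in fl.fl6_src */
	if (err)
		return err;

	/* ... attach to an skb with skb->dst = dst, or release it: */
	dst_release(dst);
	return 0;
}
#endif
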
/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);

static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
				    int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
					    sizeof(struct frag_hdr);
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(skb, &fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow normal path
	 */
	kfree_skb(skb);

	return err;
}

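/*
 * Note on the UFO path above: gso_size is the transport payload carried
 * per fragment, so each generated fragment fills one MTU once the IPv6
 * header plus extension headers (fragheaderlen) and the Fragment header
 * are added back; ip6_frag_id is stored so the segmentation code can
 * stamp the same identification into every fragment it emits.
 */
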
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above miyazawa */
		}
		dst_hold(&rt->u.dst);
		inet->cork.dst = &rt->u.dst;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = (struct rt6_info *)inet->cork.dst;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu - exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 *        --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page) + frag->page_offset + frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	inet->cork.flags &= ~IPCORK_OPT;
	kfree(np->cork.opt);
	np->cork.opt = NULL;
	if (inet->cork.dst) {
		dst_release(inet->cork.dst);
		inet->cork.dst = NULL;
		inet->cork.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.dst;
	struct flowi *fl = &inet->cork.fl;
	unsigned char proto = fl->proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	ipv6_addr_copy(final_dst, &fl->fl6_dst);
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl->fl6_flowlabel |
			 htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, final_dst);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb->dst = dst_clone(&rt->u.dst);
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = np->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb->dst)
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}
1da177e4 1516}