/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	$Id: ip6_output.c,v 1.34 2002/02/01 22:01:04 davem Exp $
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>

static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

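/*
 * Pick the Identification value for a new fragment header.  A single
 * global counter, serialized by a spinlock, is shared by all flows;
 * it skips 0 on wrap so an identification of 0 is never handed out.
 */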
static __inline__ void ipv6_select_ident(struct sk_buff *skb, struct frag_hdr *fhdr)
{
	static u32 ipv6_fragmentation_id = 1;
	static DEFINE_SPINLOCK(ip6_id_lock);

	spin_lock_bh(&ip6_id_lock);
	fhdr->identification = htonl(ipv6_fragmentation_id);
	if (++ipv6_fragmentation_id == 0)
		ipv6_fragmentation_id = 1;
	spin_unlock_bh(&ip6_id_lock);
}

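/*
 * Finish a locally generated packet: fill in payload_len (zeroed when
 * the payload exceeds IPV6_MAXPLEN, matching the jumbogram convention)
 * and run the NF_INET_LOCAL_OUT hook.  __ip6_local_out() leaves the
 * final dst_output() to the hook's okfn; ip6_local_out() calls it
 * directly when the hook verdict allows.
 */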
int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
		       dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

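/*
 * Last step of output: hand the packet to the neighbour layer, using
 * the cached hardware header (dst->hh) when one exists.  Packets with
 * no usable neighbour are counted as OUTNOROUTES and dropped.
 */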
static int ip6_output_finish(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);

	netif_rx(newskb);
	return 0;
}

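/*
 * Device-level output.  For multicast destinations that the local
 * machine also listens to, a clone is looped back through the
 * POST_ROUTING hook to ip6_dev_loopback_xmit(), and a multicast
 * packet whose hop limit is 0 goes no further than that loopback.
 * All outbound packets pass the NF_INET_POST_ROUTING hook.
 */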
static int ip6_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct net_device *dev = dst->dev;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
		struct inet6_dev *idev = ip6_dst_idev(skb->dst);

		if (!(dev->flags & IFF_LOOPBACK) && (!np || np->mc_loop) &&
		    ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					&ipv6_hdr(skb)->saddr)) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(idev, IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_INC_STATS(idev, IPSTATS_MIB_OUTMCASTPKTS);
	}

	return NF_HOOK(PF_INET6, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
		       ip6_output_finish);
}

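/*
 * MTU to honour for this skb: the raw device MTU when the sending
 * socket does its own path-MTU probing (IPV6_PMTUDISC_PROBE),
 * otherwise the path MTU recorded on the dst entry.
 */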
static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
{
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;

	return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
	       skb->dst->dev->mtu : dst_mtu(skb->dst);
}

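/*
 * dst_output() entry point: fragment when the packet exceeds the
 * route's MTU and is not GSO, or when the route requires all packets
 * to carry a fragment header (dst_allfrag); otherwise send directly.
 */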
int ip6_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb->dst))
		return ip6_fragment(skb, ip6_output2);
	else
		return ip6_output2(skb);
}

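/*
 * Note on the header word built in ip6_xmit() below: the first 32
 * bits of an IPv6 header pack version (4 bits), traffic class
 * (8 bits) and flow label (20 bits), so they are written in a single
 * store: htonl(0x60000000 | (tclass << 20)) OR-ed with the
 * already-network-order flow label from the flow.
 */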
/*
 *	xmit an sk_buff (used by TCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl,
	     struct ipv6_txoptions *opt, int ipfragok)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl->fl6_dst;
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr;
	u8  proto = fl->proto;
	int seg_len = skb->len;
	int hlimit, tclass;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		   MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(ip6_dst_idev(skb->dst),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			if (sk)
				skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */

	hlimit = -1;
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = dst_metric(dst, RTAX_HOPLIMIT);
	if (hlimit < 0)
		hlimit = ipv6_get_hoplimit(dst->dev);

	tclass = -1;
	if (np)
		tclass = np->tclass;
	if (tclass < 0)
		tclass = 0;

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl->fl6_flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
	ipv6_addr_copy(&hdr->daddr, first_hop);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || ipfragok || skb_is_gso(skb)) {
		IP6_INC_STATS(ip6_dst_idev(skb->dst),
			      IPSTATS_MIB_OUTREQUESTS);
		return NF_HOOK(PF_INET6, NF_INET_LOCAL_OUT, skb, NULL, dst->dev,
			       dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
	IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}

EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this
 *	routine. It's code duplication but I really want to avoid
 *	extra checks since ipv6_build_header is used by TCP (which
 *	is performance critical for us)
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       struct in6_addr *saddr, struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;
	int totlen;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	totlen = len + sizeof(struct ipv6hdr);

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32*)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	ipv6_addr_copy(&hdr->saddr, saddr);
	ipv6_addr_copy(&hdr->daddr, daddr);

	return 0;
}

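/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * registered on ip6_ra_chain whose selector matches.  All listeners
 * but the last receive a clone; the original skb goes to the final
 * match.  Returns 1 when the packet has been consumed.
 */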
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

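/*
 * Classify a packet destined for a proxied neighbour: returns 1 when
 * it must be handed to local input (unicast NDISC messages), -1 when
 * it must be dropped (link-local destination), and 0 to forward it
 * normally.
 */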
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

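/*
 * The forwarding path: XFRM policy check, Router Alert delivery, hop
 * limit check and decrement, NDISC proxy handling, redirect
 * generation and MTU enforcement, ending in the NF_INET_FORWARD hook.
 */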
static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);

	if (ipv6_devconf.forwarding == 0)
		goto error;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We do NOT do any processing on RA packets; we push them
	 *	to user level AS IS, without any guarantee that the
	 *	application will be able to interpret them. The reason is
	 *	that we cannot make anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP, we cannot do anything. Defragmentation would also
	 *	be a mistake: RA packets cannot be fragmented, because
	 *	there is no guarantee that different fragments will go
	 *	along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT,
			    0, skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (ipv6_devconf.proxy_ndp &&
	    pneigh_lookup(&nd_tbl, &init_net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb->dst;

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && dst->neighbour && opt->srcrt == 0 &&
	    !skb->sp) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;
		struct neighbour *n = dst->neighbour;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr*)&n->primary_key;
		else
			target = &hdr->daddr;

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (xrlim_allow(dst, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype & (IPV6_ADDR_MULTICAST|IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0, skb->dev);
			goto error;
		}
	}

	if (skb->len > dst_mtu(dst)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, dst_mtu(dst), skb->dev);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(PF_INET6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

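/*
 * Copy the per-packet metadata (packet type, priority, dst reference,
 * device, mark, scheduling and netfilter state) from the packet being
 * fragmented onto a freshly built fragment.
 */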
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

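/*
 * Find the offset at which a Fragment header must be inserted: right
 * after the unfragmentable part, i.e. past any Hop-by-Hop and Routing
 * headers and (with Mobile IPv6) a Destination Options header that
 * carries a Home Address option.  *nexthdr is left pointing at the
 * Next Header byte that will be patched to NEXTHDR_FRAGMENT.
 */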
int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

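/*
 * Fragment an oversized packet.  The fast path reuses the skb's
 * existing frag_list when every element already fits the (8-byte
 * aligned) fragment size; otherwise the slow path allocates fresh
 * skbs and copies the payload into them piece by piece.
 */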
static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct net_device *dev;
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info*)skb->dst;
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	dev = rt->u.dst.dev;
	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.  (This last
	 * check should be redundant, but it's free.)
	 */
	if (!skb->local_df) {
		skb->dev = skb->dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu, skb->dev);
		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_shinfo(skb)->frag_list) {
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr*)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(skb, fh);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->u.dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr*)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->u.dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(ip6_dst_idev(&rt->u.dst), IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->u.dst);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len+hlen+sizeof(struct frag_hdr)+LL_RESERVED_SPACE(rt->u.dst.dev), GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, LL_RESERVED_SPACE(rt->u.dst.dev));
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(skb, fh);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(ip6_dst_idev(skb->dst),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

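/*
 * A cached route stays usable when it is a host route whose key still
 * equals the flow's address, or when the socket's saved last-used
 * address (addr_cache) still matches.  A nonzero return means the
 * cached dst must be thrown away.
 */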
static inline int ip6_rt_check(struct rt6key *rt_key,
			       struct in6_addr *fl_addr,
			       struct in6_addr *addr_cache)
{
	return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  struct flowi *fl)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account that
	 * we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl->fl6_dst, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl->fl6_src, np->saddr_cache) ||
#endif
	    (fl->oif && fl->oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

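/*
 * Shared tail of the dst lookup helpers: resolve a route when the
 * caller supplied none, select a source address if the flow leaves it
 * unspecified, and, with optimistic DAD, fall back to the default
 * router's dst while our source address is still being verified.
 */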
static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi *fl)
{
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(sk->sk_net, sk, fl);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl->fl6_src)) {
		err = ipv6_dev_get_saddr(ip6_dst_idev(*dst)->dev,
					 &fl->fl6_dst, &fl->fl6_src);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	if (!((*dst)->neighbour->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi fl_gw;
		int redirect;

		ifp = ipv6_get_ifaddr(&init_net, &fl->fl6_src,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw, fl, sizeof(struct flowi));
			memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(sk->sk_net, sk, &fl_gw);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_sk_dst_lookup - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@dst: pointer to dst_entry * for result
 *	@fl: flow to lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_sk_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi *fl)
{
	*dst = NULL;
	if (sk) {
		*dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
		*dst = ip6_sk_dst_check(sk, *dst, fl);
	}

	return ip6_dst_lookup_tail(sk, dst, fl);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup);

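/*
 * UDP fragmentation offload path of ip6_append_data(): accumulate the
 * whole datagram into one large skb, record the wire fragment size in
 * gso_size and let the device (or software GSO) cut it up on
 * transmit.
 */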
static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * device, so create one single skb packet containing complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return -ENOMEM;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen -
					    sizeof(struct frag_hdr);
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(skb, &fhdr);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path
	 */
	kfree_skb(skb);

	return err;
}

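/*
 * Append data to the socket's pending (corked) queue, creating the
 * cork state on the first call.  Data is laid out in skbs that leave
 * room for a fragment header at maxfraglen boundaries, so the queue
 * can later be flushed as either one packet or a pre-cut fragment
 * chain by ip6_push_pending_frames().
 */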
int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi *fl,
	struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (np->cork.opt == NULL) {
				np->cork.opt = kmalloc(opt->tot_len,
						       sk->sk_allocation);
				if (unlikely(np->cork.opt == NULL))
					return -ENOBUFS;
			} else if (np->cork.opt->tot_len < opt->tot_len) {
				printk(KERN_DEBUG "ip6_append_data: invalid option length\n");
				return -EINVAL;
			}
			memcpy(np->cork.opt, opt, opt->tot_len);
			inet->cork.flags |= IPCORK_OPT;
			/* need source address above --miyazawa */
		}
		dst_hold(&rt->u.dst);
		np->cork.rt = rt;
		inet->cork.fl = *fl;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		inet->cork.fragsize = mtu;
		if (dst_allfrag(rt->u.dst.path))
			inet->cork.flags |= IPCORK_ALLFRAG;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = rt->u.dst.header_len + (opt ? opt->opt_flen : 0) -
			    rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		rt = np->cork.rt;
		fl = &inet->cork.fl;
		if (inet->cork.flags & IPCORK_OPT)
			opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip6_ufo_append_data(sk, getfrag, from, length, hh_len,
					  fragheaderlen, transhdrlen, mtu,
					  flags);
		if (err)
			goto error;
		return 0;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (inet->cork.length <= mtu && !(inet->cork.flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len+sizeof(struct frag_hdr));

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	inet->cork.length -= length;
	IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

1376
bf138862
PE
1377static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1378{
1379 inet->cork.flags &= ~IPCORK_OPT;
1380 kfree(np->cork.opt);
1381 np->cork.opt = NULL;
1382 if (np->cork.rt) {
1383 dst_release(&np->cork.rt->u.dst);
1384 np->cork.rt = NULL;
1385 inet->cork.flags &= ~IPCORK_ALLFRAG;
1386 }
1387 memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1388}
1389
1da177e4
LT
1390int ip6_push_pending_frames(struct sock *sk)
1391{
1392 struct sk_buff *skb, *tmp_skb;
1393 struct sk_buff **tail_skb;
1394 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1395 struct inet_sock *inet = inet_sk(sk);
1396 struct ipv6_pinfo *np = inet6_sk(sk);
1397 struct ipv6hdr *hdr;
1398 struct ipv6_txoptions *opt = np->cork.opt;
1399 struct rt6_info *rt = np->cork.rt;
1400 struct flowi *fl = &inet->cork.fl;
1401 unsigned char proto = fl->proto;
1402 int err = 0;
1403
1404 if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1405 goto out;
1406 tail_skb = &(skb_shinfo(skb)->frag_list);
1407
1408 /* move skb->data to ip header from ext header */
d56f90a7 1409 if (skb->data < skb_network_header(skb))
bbe735e4 1410 __skb_pull(skb, skb_network_offset(skb));
1da177e4 1411 while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
cfe1fc77 1412 __skb_pull(tmp_skb, skb_network_header_len(skb));
1da177e4
LT
1413 *tail_skb = tmp_skb;
1414 tail_skb = &(tmp_skb->next);
1415 skb->len += tmp_skb->len;
1416 skb->data_len += tmp_skb->len;
1da177e4
LT
1417 skb->truesize += tmp_skb->truesize;
1418 __sock_put(tmp_skb->sk);
1419 tmp_skb->destructor = NULL;
1420 tmp_skb->sk = NULL;
1da177e4
LT
1421 }
1422
28a89453 1423 /* Allow local fragmentation. */
b5c15fc0 1424 if (np->pmtudisc < IPV6_PMTUDISC_DO)
28a89453
HX
1425 skb->local_df = 1;
1426
1da177e4 1427 ipv6_addr_copy(final_dst, &fl->fl6_dst);
cfe1fc77 1428 __skb_pull(skb, skb_network_header_len(skb));
1da177e4
LT
1429 if (opt && opt->opt_flen)
1430 ipv6_push_frag_opts(skb, opt, &proto);
1431 if (opt && opt->opt_nflen)
1432 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1433
e2d1bca7
ACM
1434 skb_push(skb, sizeof(struct ipv6hdr));
1435 skb_reset_network_header(skb);
0660e03f 1436 hdr = ipv6_hdr(skb);
1ab1457c 1437
90bcaf7b 1438 *(__be32*)hdr = fl->fl6_flowlabel |
41a1f8ea 1439 htonl(0x60000000 | ((int)np->cork.tclass << 20));
1da177e4 1440
1da177e4
LT
1441 hdr->hop_limit = np->cork.hop_limit;
1442 hdr->nexthdr = proto;
1443 ipv6_addr_copy(&hdr->saddr, &fl->fl6_src);
1444 ipv6_addr_copy(&hdr->daddr, final_dst);
1445
a2c2064f 1446 skb->priority = sk->sk_priority;
4a19ec58 1447 skb->mark = sk->sk_mark;
a2c2064f 1448
1da177e4 1449 skb->dst = dst_clone(&rt->u.dst);
a11d206d 1450 IP6_INC_STATS(rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
14878f75
DS
1451 if (proto == IPPROTO_ICMPV6) {
1452 struct inet6_dev *idev = ip6_dst_idev(skb->dst);
1453
1454 ICMP6MSGOUT_INC_STATS_BH(idev, icmp6_hdr(skb)->icmp6_type);
1455 ICMP6_INC_STATS_BH(idev, ICMP6_MIB_OUTMSGS);
1456 }
1457
ef76bc23 1458 err = ip6_local_out(skb);
1da177e4
LT
1459 if (err) {
1460 if (err > 0)
3320da89 1461 err = np->recverr ? net_xmit_errno(err) : 0;
1da177e4
LT
1462 if (err)
1463 goto error;
1464 }
1465
1466out:
bf138862 1467 ip6_cork_release(inet, np);
1da177e4
LT
1468 return err;
1469error:
1470 goto out;
1471}
1472
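/*
 * Abort a corked send: throw away everything still queued on the
 * socket, counting each dropped skb as OUTDISCARDS, then release the
 * cork state.
 */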
void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb->dst)
			IP6_INC_STATS(ip6_dst_idev(skb->dst),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}