/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case the packet is not accepted
 *					by output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen	:	Fix broken PMTU recovery and remove
 *					some redundant tests.
 *		Vitaly E. Lavrov:	Transparent proxy revived after a year
 *					in a coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>
#include <linux/slab.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}
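/*
 * Illustrative note: the checksum above covers iph->ihl 32-bit words,
 * i.e. 20 bytes for a bare header (ihl == 5) and up to 60 bytes with
 * options (ihl == 15). It must be recomputed after any header field
 * changes; ip_fragment() below, for instance, calls ip_send_check()
 * again for every fragment once tot_len and frag_off are rewritten.
 */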
int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);
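/*
 * Note on the return convention above: nf_hook() returns 1 when the
 * LOCAL_OUT hook accepted the packet and the caller should continue
 * transmission itself, which is why ip_local_out() only invokes
 * dst_output() for err == 1. Any other value means the hook dropped,
 * stole or queued the skb, and is passed through to the caller.
 */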
/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));
	netif_rx_ni(newskb);
	return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = dst_metric(dst, RTAX_HOPLIMIT);
	return ttl;
}

/*
 *	Add an IP header to an skbuff and send it out.
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = 5;
	iph->tos = inet->tos;
	if (ip_dont_fragment(sk, &rt->dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl = ip_select_ttl(inet, &rt->dst);
	iph->daddr = rt->rt_dst;
	iph->saddr = rt->rt_src;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->dst, sk);

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/* Send it out. */
	return ip_local_out(skb);
}

EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);
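/*
 * Usage sketch (hypothetical caller): a connection-oriented protocol
 * with an already-routed skb, e.g. a TCP SYN-ACK reply path, could hand
 * its segment to this helper and let it prepend the IP header:
 *
 *	skb_dst_set(skb, dst);		// route attached beforehand
 *	err = ip_build_and_send_pkt(skb, sk, saddr, daddr, opt);
 *
 * Note that the daddr argument is only used when building options; the
 * addresses in the header itself come from the route (rt_src/rt_dst).
 */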
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);

	if (rt->rt_type == RTN_MULTICAST) {
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTMCAST, skb->len);
	} else if (rt->rt_type == RTN_BROADCAST)
		IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUTBCAST, skb->len);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}

static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb_dst(skb)->dev->mtu : dst_mtu(skb_dst(skb));
}

static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}
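/*
 * Example of the gating above (illustrative numbers): a 4000-byte skb
 * on a path with a 1500-byte MTU takes the ip_fragment() branch, while
 * the same payload carried as a GSO skb goes straight to
 * ip_finish_output2() and is left for the device or the software GSO
 * layer to segment.
 */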
int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = rt->dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users.
	 */

	if (rt->rt_flags & RTCF_MULTICAST) {
		if (sk_mc_loop(sk)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames
		   that came back after forwarding; ip_mr_input would drop
		   them anyway. Note that local frames are looped back to
		   be delivered to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    &&
		    ((rt->rt_flags & RTCF_LOCAL) ||
		     !(IPCB(skb)->flags & IPSKB_FORWARDED))
#endif
		   ) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host. */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags & RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(NFPROTO_IPV4, NF_INET_POST_ROUTING, newskb,
				NULL, newskb->dev, ip_dev_loopback_xmit);
	}

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL,
			    skb->dev, ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;

	IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}
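/*
 * NF_HOOK_COND above only traverses POST_ROUTING when the condition
 * holds. Packets that ip_finish_output() flagged IPSKB_REROUTED have
 * already been through the hook once before being rerouted by xfrm, so
 * they go directly to ip_finish_output() instead of being inspected a
 * second time.
 */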
int ip_queue_xmit(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt = inet->opt;
	struct rtable *rt;
	struct iphdr *iph;
	int res;

	/* Skip all of this if the packet is already routed,
	 * e.g. by something like SCTP.
	 */
	rcu_read_lock();
	rt = skb_rtable(skb);
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use the correct destination address if we have options. */
		daddr = inet->inet_daddr;
		if (opt && opt->srr)
			daddr = opt->faddr;

		{
			struct flowi fl = { .oif = sk->sk_bound_dev_if,
					    .mark = sk->sk_mark,
					    .nl_u = { .ip4_u =
						      { .daddr = daddr,
							.saddr = inet->inet_saddr,
							.tos = RT_CONN_FLAGS(sk) } },
					    .proto = sk->sk_protocol,
					    .flags = inet_sk_flowi_flags(sk),
					    .uli_u = { .ports =
						       { .sport = inet->inet_sport,
							 .dport = inet->inet_dport } } };

			/* If this fails, the transport layer's retransmit
			 * mechanism will keep trying until a route appears
			 * or the connection times out.
			 */
			security_sk_classify_flow(sk, &fl);
			if (ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0))
				goto no_route;
		}
		sk_setup_caps(sk, &rt->dst);
	}
	skb_dst_set_noref(skb, &rt->dst);

packet_routed:
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->dst) && !skb->local_df)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl = ip_select_ttl(inet, &rt->dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr = rt->rt_src;
	iph->daddr = rt->rt_dst;
	/* The transport layer sets skb->h.foo itself. */

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, inet->inet_daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	res = ip_local_out(skb);
	rcu_read_unlock();
	return res;

no_route:
	rcu_read_unlock();
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}
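/*
 * The combined store in ip_queue_xmit() above packs version, ihl and
 * tos into one 16-bit write: htons((4 << 12) | (5 << 8) | tos) lays
 * down the bytes 0x45, tos in network order, i.e. version 4, a 5-word
 * (20-byte) header and the socket's TOS. With options, ihl is then
 * bumped by optlen >> 2; e.g. 12 bytes of options raise ihl to 8.
 */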
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_copy(to, from);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}

/*
 *	This IP datagram is too large to be sent in one piece. Break it up
 *	into smaller pieces (each piece carrying the IP header plus a block
 *	of the original datagram's data) so that each piece fits in a single
 *	device frame, and queue such frames for sending.
 */
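/*
 * Worked example (illustrative numbers): with a 1500-byte path MTU and
 * a 20-byte header, each fragment may carry mtu = 1480 data bytes,
 * already a multiple of 8 as required. A 4000-byte payload therefore
 * leaves as 1480 + 1480 + 1040 bytes with frag_off values 0, 185 and
 * 370 (the offset field counts 8-byte units: 1480 / 8 = 185), and
 * IP_MF set on all but the last fragment.
 */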
int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct iphdr *iph;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs, pad;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = skb_rtable(skb);
	int err = 0;

	dev = rt->dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->dst) - hlen;	/* Size of data space */
#ifdef CONFIG_BRIDGE_NETFILTER
	if (skb->nf_bridge)
		mtu -= nf_bridge_mtu_reduction(skb);
#endif
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create a wrong frag_list or break an
	 * existing one; that is not prohibited. In this case fall back
	 * to copying.
	 *
	 * LATER: this step can be merged into the real generation of
	 * fragments; we can switch to copying when we see the first bad
	 * fragment.
	 */
	if (skb_has_frags(skb)) {
		struct sk_buff *frag;
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			truesizes += frag->truesize;
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare the header of the next frame,
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset >> 3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/* For bridged IP traffic encapsulated inside e.g. a vlan header,
	 * we need to make room for the encapsulating header.
	 */
	pad = nf_bridge_pad(skb);
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->dst.dev, pad);
	mtu -= pad;

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len + hlen + ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC: if we are fragmenting a fragment that's
		 *	not the last fragment, then keep the MF bit set on
		 *	each fragment.
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
	return err;
}

EXPORT_SYMBOL(ip_fragment);
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}
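/*
 * ip_generic_getfrag() doubles as a template for the getfrag callbacks
 * used throughout this file: copy 'len' bytes starting at 'offset' of
 * the caller's cookie 'from' into 'to', fold the block's checksum into
 * skb->csum with byte parity 'odd' unless the hardware will checksum
 * the packet (CHECKSUM_PARTIAL), and return 0 or a negative errno.
 */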
static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

static inline int ip_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
				    int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* The network device supports UDP fragmentation offload, so create
	 * one single skb containing the complete UDP datagram.
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
					  hh_len + fragheaderlen + transhdrlen + 20,
					  (flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for the hardware header */
		skb_reserve(skb, hh_len);

		/* create space for the UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize the network header pointer */
		skb_reset_network_header(skb);

		/* initialize the protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;

		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(&sk->sk_write_queue, skb);
	}

	return skb_append_datato_frags(sk, skb, getfrag, from,
				       (length - transhdrlen));
}
/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data. Each piece will be held on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP: other transport protocols, e.g. raw sockets, can
 *	potentially use this interface.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
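/*
 * Usage sketch (hypothetical datagram sender): a corked UDP-style path
 * would typically route first, append one or more pieces, then flush:
 *
 *	err = ip_append_data(sk, ip_generic_getfrag, msg->msg_iov,
 *			     len, sizeof(struct udphdr), &ipc, &rt,
 *			     flags);
 *	if (err)
 *		ip_flush_pending_frames(sk);
 *	else if (!(flags & MSG_MORE))
 *		err = ip_push_pending_frames(sk);
 *
 * The route reference is stolen via *rtp on the first append, so the
 * caller must not release it afterwards.
 */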
int ip_append_data(struct sock *sk,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable **rtp,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;

	struct ip_options *opt = NULL;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;
	struct rtable *rt;

	if (flags & MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking.
		 */
		opt = ipc->opt;
		if (opt) {
			if (inet->cork.opt == NULL) {
				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
				if (unlikely(inet->cork.opt == NULL))
					return -ENOBUFS;
			}
			memcpy(inet->cork.opt, opt, sizeof(struct ip_options) + opt->optlen);
			inet->cork.flags |= IPCORK_OPT;
			inet->cork.addr = ipc->addr;
		}
		rt = *rtp;
		if (unlikely(!rt))
			return -EFAULT;
		/*
		 * We steal a reference to this route; the caller should
		 * not release it.
		 */
		*rtp = NULL;
		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
					    rt->dst.dev->mtu :
					    dst_mtu(rt->dst.path);
		inet->cork.dst = &rt->dst;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		if ((exthdrlen = rt->dst.header_len) != 0) {
			length += exthdrlen;
			transhdrlen += exthdrlen;
		}
	} else {
		rt = (struct rtable *)inet->cork.dst;
		if (inet->cork.flags & IPCORK_OPT)
			opt = inet->cork.opt;

		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}
	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport,
			       mtu - exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we
	 * wish it not to be fragmented later.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	skb = skb_peek_tail(&sk->sk_write_queue);

	inet->cork.length += length;
	if (((length > mtu) || (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
					 fragheaderlen, transhdrlen, mtu,
					 flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use the calculated fragment length to generate a chain of
	 * skbs; each segment is an IP fragment, ready to be sent to the
	 * network once the appropriate IP header is added.
	 */

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea which fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else
					/* only the initial fragment is
					   time stamped */
					ipc->shtx.flags = 0;
			}
			if (skb == NULL)
				goto error;

			/*
			 * Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);
			*skb_tx(skb) = ipc->shtx;

			/*
			 * Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	inet->cork.length -= length;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}
ssize_t ip_append_page(struct sock *sk, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags & MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	rt = (struct rtable *)inet->cork.dst;
	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (!(rt->dst.dev->features & NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
	mtu = inet->cork.fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->inet_dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	inet->cork.length += size;
	if ((size + skb->len > mtu) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}

	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {

			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 * Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 * Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}
	return 0;

error:
	inet->cork.length -= size;
	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip_cork_release(struct inet_sock *inet)
{
	inet->cork.flags &= ~IPCORK_OPT;
	kfree(inet->cork.opt);
	inet->cork.opt = NULL;
	dst_release(inet->cork.dst);
	inet->cork.dst = NULL;
}
/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push them out.
 */
int ip_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = (struct rtable *)inet->cork.dst;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to the IP header (from the ext header) */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless the user demanded real pmtu discovery (IP_PMTUDISC_DO),
	 * we allow the frame generated here to be fragmented. No matter
	 * how transforms change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* The DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow this frame to be
	 * fragmented locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->dst) &&
	     ip_dont_fragment(sk, &rt->dst)))
		df = htons(IP_DF);

	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	if (opt) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
	}
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(iph, &rt->dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = rt->rt_src;
	iph->daddr = rt->rt_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	/*
	 * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec
	 * on the dst refcount.
	 */
	inet->cork.dst = NULL;
	skb_dst_set(skb, &rt->dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(net, ((struct icmphdr *)
			skb_transport_header(skb))->type);

	/* Netfilter gets the whole, not yet fragmented skb. */
	err = ip_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip_cork_release(inet);
	return err;

error:
	IP_INC_STATS(net, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}
/*
 *	Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(inet_sk(sk));
}


/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}
/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct {
		struct ip_options opt;
		char data[40];
	} replyopts;
	struct ipcm_cookie ipc;
	__be32 daddr;
	struct rtable *rt = skb_rtable(skb);

	if (ip_options_echo(&replyopts.opt, skb))
		return;

	daddr = ipc.addr = rt->rt_src;
	ipc.opt = NULL;
	ipc.shtx.flags = 0;

	if (replyopts.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (ipc.opt->srr)
			daddr = replyopts.opt.faddr;
	}

	{
		struct flowi fl = { .oif = arg->bound_dev_if,
				    .nl_u = { .ip4_u =
					      { .daddr = daddr,
						.saddr = rt->rt_spec_dst,
						.tos = RT_TOS(ip_hdr(skb)->tos) } },
				    /* Not quite clean, but right. */
				    .uli_u = { .ports =
					       { .sport = tcp_hdr(skb)->dest,
						 .dport = tcp_hdr(skb)->source } },
				    .proto = sk->sk_protocol,
				    .flags = ip_reply_arg_flowi_flags(arg) };
		security_skb_classify_flow(skb, &fl);
		if (ip_route_output_key(sock_net(sk), &rt, &fl))
			return;
	}

	/* And let IP do all the hard work.

	   This chunk is not reentrant, hence the spinlock.
	   Note that it relies on the fact that this function is called
	   with BH disabled locally and that sk cannot already be
	   spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = ip_hdr(skb)->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, &rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}
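/*
 * The csumoffset fix-up in ip_send_reply() above counts in 16-bit words
 * from the transport header: a caller replying with a TCP segment would
 * typically set arg->csumoffset to offsetof(struct tcphdr, check) / 2
 * and leave arg->csum holding the pseudo-header sum, letting
 * csum_fold() produce the final checksum in place.
 */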
void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}

EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);