/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The Internet Protocol (IP) output module.
 *
 * Version:	$Id: ip_output.c,v 1.100 2002/02/01 22:01:03 davem Exp $
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Donald Becker, <becker@super.org>
 *		Alan Cox, <Alan.Cox@linux.org>
 *		Richard Underwood
 *		Stefan Becker, <stefanb@yello.ping.de>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Hirokazu Takahashi, <taka@valinux.co.jp>
 *
 *	See ip_input.c for original log
 *
 *	Fixes:
 *		Alan Cox	:	Missing nonblock feature in ip_build_xmit.
 *		Mike Kilburn	:	htons() missing in ip_build_xmit.
 *		Bradford Johnson:	Fix faulty handling of some frames when
 *					no route is found.
 *		Alexander Demenshin:	Missing sk/skb free in ip_queue_xmit
 *					(in case if packet not accepted by
 *					output firewall rules)
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov:	use new route cache
 *		Andi Kleen	:	Fix broken PMTU recovery and remove
 *					some redundant tests.
 *		Vitaly E. Lavrov :	Transparent proxy revived after year coma.
 *		Andi Kleen	:	Replace ip_reply with ip_send_reply.
 *		Andi Kleen	:	Split fast and slow ip_build_xmit path
 *					for decreased register pressure on x86
 *					and more readability.
 *		Marc Boucher	:	When call_out_firewall returns FW_QUEUE,
 *					silently drop skb instead of failing with -EPERM.
 *		Detlev Wengorz	:	Copy protocol for fragments.
 *		Hirokazu Takahashi:	HW checksumming for outgoing UDP
 *					datagrams.
 *		Hirokazu Takahashi:	sendfile() on UDP works now.
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/highmem.h>

#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/proc_fs.h>
#include <linux/stat.h>
#include <linux/init.h>

#include <net/snmp.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <net/arp.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>

int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;

/* Generate a checksum for an outgoing IP datagram. */
__inline__ void ip_send_check(struct iphdr *iph)
{
	iph->check = 0;
	iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
}

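/*
 * Illustrative sketch, not part of the original file: how a caller might
 * finalize a freshly built IPv4 header with ip_send_check(). The helper
 * name and the field values are assumptions for demonstration; the
 * remaining header fields are presumed already filled in by the caller.
 */
static inline void example_finish_iphdr(struct iphdr *iph,
					unsigned int payload_len)
{
	iph->version = 4;
	iph->ihl = 5;		/* 20-byte header, no options */
	iph->tot_len = htons(sizeof(struct iphdr) + payload_len);
	ip_send_check(iph);	/* zeroes iph->check, then recomputes it */
}
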
int __ip_local_out(struct sk_buff *skb)
{
	struct iphdr *iph = ip_hdr(skb);

	iph->tot_len = htons(skb->len);
	ip_send_check(iph);
	return nf_hook(PF_INET, NF_INET_LOCAL_OUT, skb, NULL, skb->dst->dev,
		       dst_output);
}

int ip_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	BUG_TRAP(newskb->dst);
	netif_rx(newskb);
	return 0;
}

static inline int ip_select_ttl(struct inet_sock *inet, struct dst_entry *dst)
{
	int ttl = inet->uc_ttl;

	if (ttl < 0)
		ttl = dst_metric(dst, RTAX_HOPLIMIT);
	return ttl;
}

/*
 *		Add an ip header to a skbuff and send it out.
 *
 */
int ip_build_and_send_pkt(struct sk_buff *skb, struct sock *sk,
			  __be32 saddr, __be32 daddr, struct ip_options *opt)
{
	struct inet_sock *inet = inet_sk(sk);
	struct rtable *rt = (struct rtable *)skb->dst;
	struct iphdr *iph;

	/* Build the IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version  = 4;
	iph->ihl      = 5;
	iph->tos      = inet->tos;
	if (ip_dont_fragment(sk, &rt->u.dst))
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
	iph->daddr    = rt->rt_dst;
	iph->saddr    = rt->rt_src;
	iph->protocol = sk->sk_protocol;
	ip_select_ident(iph, &rt->u.dst, sk);

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, daddr, rt, 0);
	}

	skb->priority = sk->sk_priority;

	/* Send it out. */
	return ip_local_out(skb);
}

EXPORT_SYMBOL_GPL(ip_build_and_send_pkt);

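/*
 * Illustrative sketch, not upstream code: a minimal caller of
 * ip_build_and_send_pkt(). The skb is assumed to already hold the
 * transport payload with headroom for the IP header, and to carry a
 * valid route in skb->dst; passing NULL options builds a plain
 * 20-byte header in front of the payload.
 */
static inline int example_send_ctl_pkt(struct sk_buff *skb, struct sock *sk,
				       __be32 saddr, __be32 daddr)
{
	return ip_build_and_send_pkt(skb, sk, saddr, daddr, NULL);
}
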
static inline int ip_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb->dst;
	struct rtable *rt = (struct rtable *)dst;
	struct net_device *dev = dst->dev;
	unsigned int hh_len = LL_RESERVED_SPACE(dev);

	if (rt->rt_type == RTN_MULTICAST)
		IP_INC_STATS(IPSTATS_MIB_OUTMCASTPKTS);
	else if (rt->rt_type == RTN_BROADCAST)
		IP_INC_STATS(IPSTATS_MIB_OUTBCASTPKTS);

	/* Be paranoid, rather than too clever. */
	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
		struct sk_buff *skb2;

		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
		if (skb2 == NULL) {
			kfree_skb(skb);
			return -ENOMEM;
		}
		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);
		kfree_skb(skb);
		skb = skb2;
	}

	if (dst->hh)
		return neigh_hh_output(dst->hh, skb);
	else if (dst->neighbour)
		return dst->neighbour->output(skb);

	if (net_ratelimit())
		printk(KERN_DEBUG "ip_finish_output2: No header cache and no neighbour!\n");
	kfree_skb(skb);
	return -EINVAL;
}

static inline int ip_skb_dst_mtu(struct sk_buff *skb)
{
	struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;

	return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
	       skb->dst->dev->mtu : dst_mtu(skb->dst);
}

static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb->dst->xfrm != NULL) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(skb);
	}
#endif
	if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
		return ip_fragment(skb, ip_finish_output2);
	else
		return ip_finish_output2(skb);
}

int ip_mc_output(struct sk_buff *skb)
{
	struct sock *sk = skb->sk;
	struct rtable *rt = (struct rtable*)skb->dst;
	struct net_device *dev = rt->u.dst.dev;

	/*
	 *	If the indicated interface is up and running, send the packet.
	 */
	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	/*
	 *	Multicasts are looped back for other local users
	 */

	if (rt->rt_flags&RTCF_MULTICAST) {
		if ((!sk || inet_sk(sk)->mc_loop)
#ifdef CONFIG_IP_MROUTE
		/* Small optimization: do not loop back non-local frames,
		   which returned after forwarding; they will be dropped
		   by ip_mr_input in any case.
		   Note, that local frames are looped back to be delivered
		   to local recipients.

		   This check is duplicated in ip_mr_input at the moment.
		 */
		    && ((rt->rt_flags&RTCF_LOCAL) || !(IPCB(skb)->flags&IPSKB_FORWARDED))
#endif
		) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
			if (newskb)
				NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb,
					NULL, newskb->dev,
					ip_dev_loopback_xmit);
		}

		/* Multicasts with ttl 0 must not go beyond the host */

		if (ip_hdr(skb)->ttl == 0) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (rt->rt_flags&RTCF_BROADCAST) {
		struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
		if (newskb)
			NF_HOOK(PF_INET, NF_INET_POST_ROUTING, newskb, NULL,
				newskb->dev, ip_dev_loopback_xmit);
	}

	return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, skb->dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

int ip_output(struct sk_buff *skb)
{
	struct net_device *dev = skb->dst->dev;

	IP_INC_STATS(IPSTATS_MIB_OUTREQUESTS);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_IP);

	return NF_HOOK_COND(PF_INET, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip_finish_output,
			    !(IPCB(skb)->flags & IPSKB_REROUTED));
}

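/*
 * Illustrative sketch, not upstream code: the normal unicast path.
 * dst_output() simply invokes skb->dst->output(), which for routed IPv4
 * packets points at ip_output() above; POST_ROUTING and ip_finish_output()
 * then run unless the packet was rerouted after SNAT (IPSKB_REROUTED).
 */
static inline int example_dst_output_path(struct sk_buff *skb)
{
	return skb->dst->output(skb);	/* -> ip_output() for IPv4 unicast */
}
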
int ip_queue_xmit(struct sk_buff *skb, int ipfragok)
{
	struct sock *sk = skb->sk;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt = inet->opt;
	struct rtable *rt;
	struct iphdr *iph;

	/* Skip all of this if the packet is already routed,
	 * e.g. by something like SCTP.
	 */
	rt = (struct rtable *) skb->dst;
	if (rt != NULL)
		goto packet_routed;

	/* Make sure we can route this packet. */
	rt = (struct rtable *)__sk_dst_check(sk, 0);
	if (rt == NULL) {
		__be32 daddr;

		/* Use correct destination address if we have options. */
		daddr = inet->daddr;
		if (opt && opt->srr)
			daddr = opt->faddr;

		{
			struct flowi fl = { .oif = sk->sk_bound_dev_if,
					    .nl_u = { .ip4_u =
						      { .daddr = daddr,
							.saddr = inet->saddr,
							.tos = RT_CONN_FLAGS(sk) } },
					    .proto = sk->sk_protocol,
					    .uli_u = { .ports =
						       { .sport = inet->sport,
							 .dport = inet->dport } } };

			/* If this fails, retransmit mechanism of transport layer will
			 * keep trying until route appears or the connection times
			 * itself out.
			 */
			security_sk_classify_flow(sk, &fl);
			if (ip_route_output_flow(&init_net, &rt, &fl, sk, 0))
				goto no_route;
		}
		sk_setup_caps(sk, &rt->u.dst);
	}
	skb->dst = dst_clone(&rt->u.dst);

packet_routed:
	if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway)
		goto no_route;

	/* OK, we know where to send it, allocate and build IP header. */
	skb_push(skb, sizeof(struct iphdr) + (opt ? opt->optlen : 0));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	*((__be16 *)iph) = htons((4 << 12) | (5 << 8) | (inet->tos & 0xff));
	if (ip_dont_fragment(sk, &rt->u.dst) && !ipfragok)
		iph->frag_off = htons(IP_DF);
	else
		iph->frag_off = 0;
	iph->ttl      = ip_select_ttl(inet, &rt->u.dst);
	iph->protocol = sk->sk_protocol;
	iph->saddr    = rt->rt_src;
	iph->daddr    = rt->rt_dst;
	/* Transport layer sets skb->h.foo itself. */

	if (opt && opt->optlen) {
		iph->ihl += opt->optlen >> 2;
		ip_options_build(skb, opt, inet->daddr, rt, 0);
	}

	ip_select_ident_more(iph, &rt->u.dst, sk,
			     (skb_shinfo(skb)->gso_segs ?: 1) - 1);

	skb->priority = sk->sk_priority;

	return ip_local_out(skb);

no_route:
	IP_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EHOSTUNREACH;
}

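/*
 * Illustrative sketch, not upstream code: how a connection-oriented
 * transport might hand a segment to ip_queue_xmit(). If skb->dst is
 * already valid (e.g. set by SCTP) the routing step is skipped;
 * otherwise the cached socket route is revalidated or a fresh lookup
 * is done. ipfragok == 0 leaves the DF decision to ip_dont_fragment().
 */
static inline int example_transport_xmit(struct sk_buff *skb)
{
	return ip_queue_xmit(skb, 0);
}
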
static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	dst_release(to->dst);
	to->dst = dst_clone(from->dst);
	to->dev = from->dev;
	to->mark = from->mark;

	/* Copy the flags to each fragment. */
	IPCB(to)->flags = IPCB(from)->flags;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
	to->ipvs_property = from->ipvs_property;
#endif
	skb_copy_secmark(to, from);
}

/*
 *	This IP datagram is too large to be sent in one piece.  Break it up into
 *	smaller pieces (each of size equal to IP header plus
 *	a block of the data of the original IP data part) that will yet fit in a
 *	single device frame, and queue such a frame for sending.
 */

int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
{
	struct iphdr *iph;
	int raw = 0;
	int ptr;
	struct net_device *dev;
	struct sk_buff *skb2;
	unsigned int mtu, hlen, left, len, ll_rs, pad;
	int offset;
	__be16 not_last_frag;
	struct rtable *rt = (struct rtable*)skb->dst;
	int err = 0;

	dev = rt->u.dst.dev;

	/*
	 *	Point into the IP datagram header.
	 */

	iph = ip_hdr(skb);

	if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
		icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
			  htonl(ip_skb_dst_mtu(skb)));
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/*
	 *	Setup starting values.
	 */

	hlen = iph->ihl * 4;
	mtu = dst_mtu(&rt->u.dst) - hlen;	/* Size of data space */
	IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;

	/* When frag_list is given, use it. First, check its validity:
	 * some transformers could create a wrong frag_list or break an
	 * existing one; it is not prohibited. In this case fall back to copying.
	 *
	 * LATER: this step can be merged to real generation of fragments,
	 * we can switch to copy when we see the first bad fragment.
	 */
	if (skb_shinfo(skb)->frag_list) {
		struct sk_buff *frag;
		int first_len = skb_pagelen(skb);
		int truesizes = 0;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    (iph->frag_off & htons(IP_MF|IP_OFFSET)) ||
		    skb_cloned(skb))
			goto slow_path;

		for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path;

			BUG_ON(frag->sk);
			if (skb->sk) {
				sock_hold(skb->sk);
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
				truesizes += frag->truesize;
			}
		}

		/* Everything is OK. Generate! */

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_shinfo(skb)->frag_list = NULL;
		skb->data_len = first_len - skb_headlen(skb);
		skb->truesize -= truesizes;
		skb->len = first_len;
		iph->tot_len = htons(first_len);
		iph->frag_off = htons(IP_MF);
		ip_send_check(iph);

		for (;;) {
			/* Prepare header of the next frame,
			 * before the previous one goes down. */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), iph, hlen);
				iph = ip_hdr(frag);
				iph->tot_len = htons(frag->len);
				ip_copy_metadata(frag, skb);
				if (offset == 0)
					ip_options_fragment(frag);
				offset += skb->len - hlen;
				iph->frag_off = htons(offset>>3);
				if (frag->next != NULL)
					iph->frag_off |= htons(IP_MF);
				/* Ready, complete checksum */
				ip_send_check(iph);
			}

			err = output(skb);

			if (!err)
				IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		if (err == 0) {
			IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}
		IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
		return err;
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = raw + hlen;		/* Where to start from */

	/* for bridged IP traffic encapsulated inside e.g. a vlan header,
	 * we need to make room for the encapsulating header
	 */
	pad = nf_bridge_pad(skb);
	ll_rs = LL_RESERVED_SPACE_EXTRA(rt->u.dst.dev, pad);
	mtu -= pad;

	/*
	 *	Fragment the datagram.
	 */

	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
	not_last_frag = iph->frag_off & htons(IP_MF);

	/*
	 *	Keep copying data until we run out.
	 */

	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left)	{
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((skb2 = alloc_skb(len+hlen+ll_rs, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IP: frag: no memory for new fragment!\n");
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip_copy_metadata(skb2, skb);
		skb_reserve(skb2, ll_rs);
		skb_put(skb2, len + hlen);
		skb_reset_network_header(skb2);
		skb2->transport_header = skb2->network_header + hlen;

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */

		skb_copy_from_linear_data(skb, skb_network_header(skb2), hlen);

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(skb2), len))
			BUG();
		left -= len;

		/*
		 *	Fill in the new header fields.
		 */
		iph = ip_hdr(skb2);
		iph->frag_off = htons((offset >> 3));

		/* ANK: dirty, but effective trick. Upgrade options only if
		 * the segment to be fragmented was THE FIRST (otherwise,
		 * options are already fixed) and make it ONCE
		 * on the initial skb, so that all the following fragments
		 * will inherit fixed options.
		 */
		if (offset == 0)
			ip_options_fragment(skb);

		/*
		 *	Added AC : If we are fragmenting a fragment that's not the
		 *		   last fragment then keep the MF bit set on each fragment
		 */
		if (left > 0 || not_last_frag)
			iph->frag_off |= htons(IP_MF);
		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		iph->tot_len = htons(len + hlen);

		ip_send_check(iph);

		err = output(skb2);
		if (err)
			goto fail;

		IP_INC_STATS(IPSTATS_MIB_FRAGCREATES);
	}
	kfree_skb(skb);
	IP_INC_STATS(IPSTATS_MIB_FRAGOKS);
	return err;

fail:
	kfree_skb(skb);
	IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
	return err;
}

EXPORT_SYMBOL(ip_fragment);

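/*
 * Illustrative sketch, not upstream code: the slow-path length arithmetic
 * above in miniature. Every fragment except the last must carry a payload
 * that is a multiple of 8 bytes, because iph->frag_off counts 8-byte
 * units (hence the offset >> 3 above).
 */
static inline unsigned int example_frag_payload_len(unsigned int left,
						    unsigned int mtu_data)
{
	unsigned int len = left;

	if (len > mtu_data)	/* doesn't fit: take what the MTU allows */
		len = mtu_data;
	if (len < left)		/* not the final fragment: 8-byte align */
		len &= ~7;
	return len;
}
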
int
ip_generic_getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb)
{
	struct iovec *iov = from;

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		if (memcpy_fromiovecend(to, iov, offset, len) < 0)
			return -EFAULT;
	} else {
		__wsum csum = 0;
		if (csum_partial_copy_fromiovecend(to, iov, offset, len, &csum) < 0)
			return -EFAULT;
		skb->csum = csum_block_add(skb->csum, csum, odd);
	}
	return 0;
}

static inline __wsum
csum_page(struct page *page, int offset, int copy)
{
	char *kaddr;
	__wsum csum;
	kaddr = kmap(page);
	csum = csum_partial(kaddr + offset, copy, 0);
	kunmap(page);
	return csum;
}

static inline int ip_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags)
{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP fragmentation offload by the network
	 * device, so create a single skb packet containing the complete
	 * udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);

		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
		sk->sk_sndmsg_off = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		/* specify the length of each IP datagram fragment */
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UFO,
	 * so follow the normal path
	 */
	kfree_skb(skb);
	return err;
}

/*
 *	ip_append_data() and ip_append_page() can make one large IP datagram
 *	from many pieces of data. Each piece will be held on the socket
 *	until ip_push_pending_frames() is called. Each piece can be a page
 *	or non-page data.
 *
 *	Not only UDP; other transport protocols - e.g. raw sockets - can
 *	potentially use this interface.
 *
 *	LATER: length must be adjusted by pad at tail, when it is required.
 */
int ip_append_data(struct sock *sk,
		   int getfrag(void *from, char *to, int offset, int len,
			       int odd, struct sk_buff *skb),
		   void *from, int length, int transhdrlen,
		   struct ipcm_cookie *ipc, struct rtable *rt,
		   unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct ip_options *opt = NULL;
	int hh_len;
	int exthdrlen;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	unsigned int maxfraglen, fragheaderlen;
	int csummode = CHECKSUM_NONE;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking.
		 */
		opt = ipc->opt;
		if (opt) {
			if (inet->cork.opt == NULL) {
				inet->cork.opt = kmalloc(sizeof(struct ip_options) + 40, sk->sk_allocation);
				if (unlikely(inet->cork.opt == NULL))
					return -ENOBUFS;
			}
			memcpy(inet->cork.opt, opt, sizeof(struct ip_options)+opt->optlen);
			inet->cork.flags |= IPCORK_OPT;
			inet->cork.addr = ipc->addr;
		}
		dst_hold(&rt->u.dst);
		inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
					    rt->u.dst.dev->mtu :
					    dst_mtu(rt->u.dst.path);
		inet->cork.rt = rt;
		inet->cork.length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		if ((exthdrlen = rt->u.dst.header_len) != 0) {
			length += exthdrlen;
			transhdrlen += exthdrlen;
		}
	} else {
		rt = inet->cork.rt;
		if (inet->cork.flags & IPCORK_OPT)
			opt = inet->cork.opt;

		transhdrlen = 0;
		exthdrlen = 0;
		mtu = inet->cork.fragsize;
	}
	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
		return -EMSGSIZE;
	}

	/*
	 * transhdrlen > 0 means that this is the first fragment and we wish
	 * it won't be fragmented in the future.
	 */
	if (transhdrlen &&
	    length + fragheaderlen <= mtu &&
	    rt->u.dst.dev->features & NETIF_F_V4_CSUM &&
	    !exthdrlen)
		csummode = CHECKSUM_PARTIAL;

	inet->cork.length += length;
	if (((length > mtu) && (sk->sk_protocol == IPPROTO_UDP)) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {

		err = ip_ufo_append_data(sk, getfrag, from, length, hh_len,
					 fragheaderlen, transhdrlen, mtu,
					 flags);
		if (err)
			goto error;
		return 0;
	}

	/* So, what's going on in the loop below?
	 *
	 * We use the calculated fragment length to generate a chained skb;
	 * each segment is an IP fragment ready for sending to the network
	 * once the appropriate IP header is added.
	 */

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = mtu - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;
		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > mtu - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;
			fraglen = datalen + fragheaderlen;

			if ((flags & MSG_MORE) &&
			    !(rt->u.dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			/* The last fragment gets additional space at tail.
			 * Note, with MSG_MORE we overallocate on fragments,
			 * because we have no idea which fragment will be
			 * the last.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->u.dst.trailer_len;

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len + 15,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len + 15, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
			}
			if (skb == NULL)
				goto error;

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			data += fragheaderlen;

			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			copy = datalen - transhdrlen - fraggap;
			if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->u.dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
					offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != frag->page) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					get_page(page);
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from, page_address(frag->page)+frag->page_offset+frag->size, offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			frag->size += copy;
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error:
	inet->cork.length -= length;
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}

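/*
 * Illustrative sketch, not upstream code: the corking pattern UDP-style
 * callers follow around ip_append_data(). transhdrlen is 0 here to keep
 * the sketch transport-agnostic; a real transport would pass the size of
 * its header. On error the pending queue must be flushed, otherwise one
 * datagram is pushed out.
 */
static inline int example_cork_and_send(struct sock *sk, struct iovec *iov,
					int len, struct ipcm_cookie *ipc,
					struct rtable *rt)
{
	int err;

	err = ip_append_data(sk, ip_generic_getfrag, iov, len, 0,
			     ipc, rt, MSG_DONTWAIT);
	if (err)
		ip_flush_pending_frames(sk);	/* drop what was queued */
	else
		err = ip_push_pending_frames(sk);	/* emit the datagram */
	return err;
}
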
ssize_t	ip_append_page(struct sock *sk, struct page *page,
		       int offset, size_t size, int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct sk_buff *skb;
	struct rtable *rt;
	struct ip_options *opt = NULL;
	int hh_len;
	int mtu;
	int len;
	int err;
	unsigned int maxfraglen, fragheaderlen, fraggap;

	if (inet->hdrincl)
		return -EPERM;

	if (flags&MSG_PROBE)
		return 0;

	if (skb_queue_empty(&sk->sk_write_queue))
		return -EINVAL;

	rt = inet->cork.rt;
	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (!(rt->u.dst.dev->features&NETIF_F_SG))
		return -EOPNOTSUPP;

	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
	mtu = inet->cork.fragsize;

	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;

	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
		return -EMSGSIZE;
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		return -EINVAL;

	inet->cork.length += size;
	if ((sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->u.dst.dev->features & NETIF_F_UFO)) {
		skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
	}

	while (size > 0) {
		int i;

		if (skb_is_gso(skb))
			len = size;
		else {
			/* Check if the remaining data fits into current packet. */
			len = mtu - skb->len;
			if (len < size)
				len = maxfraglen - skb->len;
		}
		if (len <= 0) {
			struct sk_buff *skb_prev;
			int alloclen;

			skb_prev = skb;
			fraggap = skb_prev->len - maxfraglen;

			alloclen = fragheaderlen + hh_len + fraggap + 15;
			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
			if (unlikely(!skb)) {
				err = -ENOBUFS;
				goto error;
			}

			/*
			 *	Fill in the control structures
			 */
			skb->ip_summed = CHECKSUM_NONE;
			skb->csum = 0;
			skb_reserve(skb, hh_len);

			/*
			 *	Find where to start putting bytes.
			 */
			skb_put(skb, fragheaderlen + fraggap);
			skb_reset_network_header(skb);
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(skb_prev,
								   maxfraglen,
						    skb_transport_header(skb),
								   fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				pskb_trim_unique(skb_prev, maxfraglen);
			}

			/*
			 * Put the packet on the pending queue.
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		i = skb_shinfo(skb)->nr_frags;
		if (len > size)
			len = size;
		if (skb_can_coalesce(skb, i, page, offset)) {
			skb_shinfo(skb)->frags[i-1].size += len;
		} else if (i < MAX_SKB_FRAGS) {
			get_page(page);
			skb_fill_page_desc(skb, i, page, offset, len);
		} else {
			err = -EMSGSIZE;
			goto error;
		}

		if (skb->ip_summed == CHECKSUM_NONE) {
			__wsum csum;
			csum = csum_page(page, offset, len);
			skb->csum = csum_block_add(skb->csum, csum, skb->len);
		}

		skb->len += len;
		skb->data_len += len;
		skb->truesize += len;
		atomic_add(len, &sk->sk_wmem_alloc);
		offset += len;
		size -= len;
	}

	return 0;

error:
	inet->cork.length -= size;
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip_cork_release(struct inet_sock *inet)
{
	inet->cork.flags &= ~IPCORK_OPT;
	kfree(inet->cork.opt);
	inet->cork.opt = NULL;
	if (inet->cork.rt) {
		ip_rt_put(inet->cork.rt);
		inet->cork.rt = NULL;
	}
}

/*
 *	Combine all pending IP fragments on the socket into one IP datagram
 *	and push it out.
 */
int ip_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct inet_sock *inet = inet_sk(sk);
	struct ip_options *opt = NULL;
	struct rtable *rt = inet->cork.rt;
	struct iphdr *iph;
	__be16 df = 0;
	__u8 ttl;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		__sock_put(tmp_skb->sk);
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Unless user demanded real pmtu discovery (IP_PMTUDISC_DO), we allow
	 * to fragment the frame generated here. No matter how transforms
	 * change the size of the packet, it will come out.
	 */
	if (inet->pmtudisc < IP_PMTUDISC_DO)
		skb->local_df = 1;

	/* DF bit is set when we want to see DF on outgoing frames.
	 * If local_df is set too, we still allow to fragment this frame
	 * locally. */
	if (inet->pmtudisc >= IP_PMTUDISC_DO ||
	    (skb->len <= dst_mtu(&rt->u.dst) &&
	     ip_dont_fragment(sk, &rt->u.dst)))
		df = htons(IP_DF);

	if (inet->cork.flags & IPCORK_OPT)
		opt = inet->cork.opt;

	if (rt->rt_type == RTN_MULTICAST)
		ttl = inet->mc_ttl;
	else
		ttl = ip_select_ttl(inet, &rt->u.dst);

	iph = (struct iphdr *)skb->data;
	iph->version = 4;
	iph->ihl = 5;
	if (opt) {
		iph->ihl += opt->optlen>>2;
		ip_options_build(skb, opt, inet->cork.addr, rt, 0);
	}
	iph->tos = inet->tos;
	iph->frag_off = df;
	ip_select_ident(iph, &rt->u.dst, sk);
	iph->ttl = ttl;
	iph->protocol = sk->sk_protocol;
	iph->saddr = rt->rt_src;
	iph->daddr = rt->rt_dst;

	skb->priority = sk->sk_priority;
	skb->dst = dst_clone(&rt->u.dst);

	if (iph->protocol == IPPROTO_ICMP)
		icmp_out_count(((struct icmphdr *)
			skb_transport_header(skb))->type);

	/* Netfilter gets the whole, not yet fragmented skb. */
	err = ip_local_out(skb);
	if (err) {
		if (err > 0)
			err = inet->recverr ? net_xmit_errno(err) : 0;
		if (err)
			goto error;
	}

out:
	ip_cork_release(inet);
	return err;

error:
	IP_INC_STATS(IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

/*
 *	Throw away all pending data on the socket.
 */
void ip_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL)
		kfree_skb(skb);

	ip_cork_release(inet_sk(sk));
}

/*
 *	Fetch data from kernel space and fill in checksum if needed.
 */
static int ip_reply_glue_bits(void *dptr, char *to, int offset,
			      int len, int odd, struct sk_buff *skb)
{
	__wsum csum;

	csum = csum_partial_copy_nocheck(dptr+offset, to, len, 0);
	skb->csum = csum_block_add(skb->csum, csum, odd);
	return 0;
}

/*
 *	Generic function to send a packet as reply to another packet.
 *	Used to send TCP resets so far. ICMP should use this function too.
 *
 *	Should run single threaded per socket because it uses the sock
 *	structure to pass arguments.
 */
void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *arg,
		   unsigned int len)
{
	struct inet_sock *inet = inet_sk(sk);
	struct {
		struct ip_options	opt;
		char			data[40];
	} replyopts;
	struct ipcm_cookie ipc;
	__be32 daddr;
	struct rtable *rt = (struct rtable*)skb->dst;

	if (ip_options_echo(&replyopts.opt, skb))
		return;

	daddr = ipc.addr = rt->rt_src;
	ipc.opt = NULL;

	if (replyopts.opt.optlen) {
		ipc.opt = &replyopts.opt;

		if (ipc.opt->srr)
			daddr = replyopts.opt.faddr;
	}

	{
		struct flowi fl = { .oif = arg->bound_dev_if,
				    .nl_u = { .ip4_u =
					      { .daddr = daddr,
						.saddr = rt->rt_spec_dst,
						.tos = RT_TOS(ip_hdr(skb)->tos) } },
				    /* Not quite clean, but right. */
				    .uli_u = { .ports =
					       { .sport = tcp_hdr(skb)->dest,
						 .dport = tcp_hdr(skb)->source } },
				    .proto = sk->sk_protocol };
		security_skb_classify_flow(skb, &fl);
		if (ip_route_output_key(sk->sk_net, &rt, &fl))
			return;
	}

	/* And let IP do all the hard work.

	   This chunk is not reentrant, hence spinlock.
	   Note that it uses the fact that this function is called
	   with locally disabled BH and that sk cannot already be spinlocked.
	 */
	bh_lock_sock(sk);
	inet->tos = ip_hdr(skb)->tos;
	sk->sk_priority = skb->priority;
	sk->sk_protocol = ip_hdr(skb)->protocol;
	sk->sk_bound_dev_if = arg->bound_dev_if;
	ip_append_data(sk, ip_reply_glue_bits, arg->iov->iov_base, len, 0,
		       &ipc, rt, MSG_DONTWAIT);
	if ((skb = skb_peek(&sk->sk_write_queue)) != NULL) {
		if (arg->csumoffset >= 0)
			*((__sum16 *)skb_transport_header(skb) +
			  arg->csumoffset) = csum_fold(csum_add(skb->csum,
								arg->csum));
		skb->ip_summed = CHECKSUM_NONE;
		ip_push_pending_frames(sk);
	}

	bh_unlock_sock(sk);

	ip_rt_put(rt);
}

void __init ip_init(void)
{
	ip_rt_init();
	inet_initpeers();

#if defined(CONFIG_IP_MULTICAST) && defined(CONFIG_PROC_FS)
	igmp_mc_proc_init();
#endif
}

EXPORT_SYMBOL(ip_generic_getfrag);
EXPORT_SYMBOL(ip_queue_xmit);
EXPORT_SYMBOL(ip_send_check);