]> bbs.cooldavid.org Git - net-next-2.6.git/blobdiff - net/ipv4/tcp_input.c
net: use get/put_unaligned_* helpers
[net-next-2.6.git] / net / ipv4 / tcp_input.c
index 7facdb0f69608be661e409c2ead641f5d0fb1bc8..eda4f4a233f36e940fe81cf83188dcbc2817e0fa 100644 (file)
@@ -1172,8 +1172,8 @@ static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb,
                           struct tcp_sack_block_wire *sp, int num_sacks,
                           u32 prior_snd_una)
 {
-       u32 start_seq_0 = ntohl(get_unaligned(&sp[0].start_seq));
-       u32 end_seq_0 = ntohl(get_unaligned(&sp[0].end_seq));
+       u32 start_seq_0 = get_unaligned_be32(&sp[0].start_seq);
+       u32 end_seq_0 = get_unaligned_be32(&sp[0].end_seq);
        int dup_sack = 0;
 
        if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) {
@@ -1181,8 +1181,8 @@ static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb,
                tcp_dsack_seen(tp);
                NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV);
        } else if (num_sacks > 1) {
-               u32 end_seq_1 = ntohl(get_unaligned(&sp[1].end_seq));
-               u32 start_seq_1 = ntohl(get_unaligned(&sp[1].start_seq));
+               u32 end_seq_1 = get_unaligned_be32(&sp[1].end_seq);
+               u32 start_seq_1 = get_unaligned_be32(&sp[1].start_seq);
 
                if (!after(end_seq_0, end_seq_1) &&
                    !before(start_seq_0, start_seq_1)) {
@@ -1453,8 +1453,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb,
        for (i = 0; i < num_sacks; i++) {
                int dup_sack = !i && found_dup_sack;
 
-               sp[used_sacks].start_seq = ntohl(get_unaligned(&sp_wire[i].start_seq));
-               sp[used_sacks].end_seq = ntohl(get_unaligned(&sp_wire[i].end_seq));
+               sp[used_sacks].start_seq = get_unaligned_be32(&sp_wire[i].start_seq);
+               sp[used_sacks].end_seq = get_unaligned_be32(&sp_wire[i].end_seq);
 
                if (!tcp_is_sackblock_valid(tp, dup_sack,
                                            sp[used_sacks].start_seq,
@@ -1625,13 +1625,11 @@ out:
        return flag;
 }
 
-/* If we receive more dupacks than we expected counting segments
- * in assumption of absent reordering, interpret this as reordering.
- * The only another reason could be bug in receiver TCP.
+/* Limits sacked_out so that sum with lost_out isn't ever larger than
+ * packets_out. Returns zero if sacked_out adjustement wasn't necessary.
  */
-static void tcp_check_reno_reordering(struct sock *sk, const int addend)
+int tcp_limit_reno_sacked(struct tcp_sock *tp)
 {
-       struct tcp_sock *tp = tcp_sk(sk);
        u32 holes;
 
        holes = max(tp->lost_out, 1U);
@@ -1639,8 +1637,20 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
 
        if ((tp->sacked_out + holes) > tp->packets_out) {
                tp->sacked_out = tp->packets_out - holes;
-               tcp_update_reordering(sk, tp->packets_out + addend, 0);
+               return 1;
        }
+       return 0;
+}
+
+/* If we receive more dupacks than we expected counting segments
+ * in assumption of absent reordering, interpret this as reordering.
+ * The only another reason could be bug in receiver TCP.
+ */
+static void tcp_check_reno_reordering(struct sock *sk, const int addend)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       if (tcp_limit_reno_sacked(tp))
+               tcp_update_reordering(sk, tp->packets_out + addend, 0);
 }
 
 /* Emulate SACKs for SACKless connection: account for a new dupack. */
@@ -1681,11 +1691,16 @@ static inline void tcp_reset_reno_sack(struct tcp_sock *tp)
 int tcp_use_frto(struct sock *sk)
 {
        const struct tcp_sock *tp = tcp_sk(sk);
+       const struct inet_connection_sock *icsk = inet_csk(sk);
        struct sk_buff *skb;
 
        if (!sysctl_tcp_frto)
                return 0;
 
+       /* MTU probe and F-RTO won't really play nicely along currently */
+       if (icsk->icsk_mtup.probe_size)
+               return 0;
+
        if (IsSackFrto())
                return 1;
 
@@ -2134,11 +2149,13 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb)
 /* Mark head of queue up as lost. With RFC3517 SACK, the packets is
  * is against sacked "cnt", otherwise it's against facked "cnt"
  */
-static void tcp_mark_head_lost(struct sock *sk, int packets, int fast_rexmit)
+static void tcp_mark_head_lost(struct sock *sk, int packets)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
-       int cnt;
+       int cnt, oldcnt;
+       int err;
+       unsigned int mss;
 
        BUG_TRAP(packets <= tp->packets_out);
        if (tp->lost_skb_hint) {
@@ -2157,13 +2174,25 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int fast_rexmit)
                tp->lost_skb_hint = skb;
                tp->lost_cnt_hint = cnt;
 
+               if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq))
+                       break;
+
+               oldcnt = cnt;
                if (tcp_is_fack(tp) || tcp_is_reno(tp) ||
                    (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
                        cnt += tcp_skb_pcount(skb);
 
-               if (((!fast_rexmit || (tp->lost_out > 0)) && (cnt > packets)) ||
-                   after(TCP_SKB_CB(skb)->end_seq, tp->high_seq))
-                       break;
+               if (cnt > packets) {
+                       if (tcp_is_sack(tp) || (oldcnt >= packets))
+                               break;
+
+                       mss = skb_shinfo(skb)->gso_size;
+                       err = tcp_fragment(sk, skb, (packets - oldcnt) * mss, mss);
+                       if (err < 0)
+                               break;
+                       cnt = packets;
+               }
+
                if (!(TCP_SKB_CB(skb)->sacked & (TCPCB_SACKED_ACKED|TCPCB_LOST))) {
                        TCP_SKB_CB(skb)->sacked |= TCPCB_LOST;
                        tp->lost_out += tcp_skb_pcount(skb);
@@ -2180,17 +2209,17 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit)
        struct tcp_sock *tp = tcp_sk(sk);
 
        if (tcp_is_reno(tp)) {
-               tcp_mark_head_lost(sk, 1, fast_rexmit);
+               tcp_mark_head_lost(sk, 1);
        } else if (tcp_is_fack(tp)) {
                int lost = tp->fackets_out - tp->reordering;
                if (lost <= 0)
                        lost = 1;
-               tcp_mark_head_lost(sk, lost, fast_rexmit);
+               tcp_mark_head_lost(sk, lost);
        } else {
                int sacked_upto = tp->sacked_out - tp->reordering;
-               if (sacked_upto < 0)
-                       sacked_upto = 0;
-               tcp_mark_head_lost(sk, sacked_upto, fast_rexmit);
+               if (sacked_upto < fast_rexmit)
+                       sacked_upto = fast_rexmit;
+               tcp_mark_head_lost(sk, sacked_upto);
        }
 
        /* New heuristics: it is possible only after we switched
@@ -2269,7 +2298,7 @@ static inline int tcp_packet_delayed(struct tcp_sock *tp)
 {
        return !tp->retrans_stamp ||
                (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr &&
-                (__s32)(tp->rx_opt.rcv_tsecr - tp->retrans_stamp) < 0);
+                before(tp->rx_opt.rcv_tsecr, tp->retrans_stamp));
 }
 
 /* Undo procedures. */
@@ -2280,12 +2309,25 @@ static void DBGUNDO(struct sock *sk, const char *msg)
        struct tcp_sock *tp = tcp_sk(sk);
        struct inet_sock *inet = inet_sk(sk);
 
-       printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n",
-              msg,
-              NIPQUAD(inet->daddr), ntohs(inet->dport),
-              tp->snd_cwnd, tcp_left_out(tp),
-              tp->snd_ssthresh, tp->prior_ssthresh,
-              tp->packets_out);
+       if (sk->sk_family == AF_INET) {
+               printk(KERN_DEBUG "Undo %s " NIPQUAD_FMT "/%u c%u l%u ss%u/%u p%u\n",
+                      msg,
+                      NIPQUAD(inet->daddr), ntohs(inet->dport),
+                      tp->snd_cwnd, tcp_left_out(tp),
+                      tp->snd_ssthresh, tp->prior_ssthresh,
+                      tp->packets_out);
+       }
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+       else if (sk->sk_family == AF_INET6) {
+               struct ipv6_pinfo *np = inet6_sk(sk);
+               printk(KERN_DEBUG "Undo %s " NIP6_FMT "/%u c%u l%u ss%u/%u p%u\n",
+                      msg,
+                      NIP6(np->daddr), ntohs(inet->dport),
+                      tp->snd_cwnd, tcp_left_out(tp),
+                      tp->snd_ssthresh, tp->prior_ssthresh,
+                      tp->packets_out);
+       }
+#endif
 }
 #else
 #define DBGUNDO(x...) do { } while (0)
@@ -2524,7 +2566,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
            before(tp->snd_una, tp->high_seq) &&
            icsk->icsk_ca_state != TCP_CA_Open &&
            tp->fackets_out > tp->reordering) {
-               tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0);
+               tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering);
                NET_INC_STATS_BH(LINUX_MIB_TCPLOSS);
        }
 
@@ -2586,6 +2628,8 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
        case TCP_CA_Loss:
                if (flag & FLAG_DATA_ACKED)
                        icsk->icsk_retransmits = 0;
+               if (tcp_is_reno(tp) && flag & FLAG_SND_UNA_ADVANCED)
+                       tcp_reset_reno_sack(tp);
                if (!tcp_try_undo_loss(sk)) {
                        tcp_moderate_cwnd(tp);
                        tcp_xmit_retransmit_queue(sk);
@@ -3296,7 +3340,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
                        switch (opcode) {
                        case TCPOPT_MSS:
                                if (opsize == TCPOLEN_MSS && th->syn && !estab) {
-                                       u16 in_mss = ntohs(get_unaligned((__be16 *)ptr));
+                                       u16 in_mss = get_unaligned_be16(ptr);
                                        if (in_mss) {
                                                if (opt_rx->user_mss &&
                                                    opt_rx->user_mss < in_mss)
@@ -3325,8 +3369,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
                                    ((estab && opt_rx->tstamp_ok) ||
                                     (!estab && sysctl_tcp_timestamps))) {
                                        opt_rx->saw_tstamp = 1;
-                                       opt_rx->rcv_tsval = ntohl(get_unaligned((__be32 *)ptr));
-                                       opt_rx->rcv_tsecr = ntohl(get_unaligned((__be32 *)(ptr+4)));
+                                       opt_rx->rcv_tsval = get_unaligned_be32(ptr);
+                                       opt_rx->rcv_tsecr = get_unaligned_be32(ptr + 4);
                                }
                                break;
                        case TCPOPT_SACK_PERM:
@@ -3561,7 +3605,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th)
                 * cases we should never reach this piece of code.
                 */
                printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n",
-                      __FUNCTION__, sk->sk_state);
+                      __func__, sk->sk_state);
                break;
        }
 
@@ -3810,8 +3854,28 @@ static void tcp_ofo_queue(struct sock *sk)
        }
 }
 
+static int tcp_prune_ofo_queue(struct sock *sk);
 static int tcp_prune_queue(struct sock *sk);
 
+static inline int tcp_try_rmem_schedule(struct sock *sk, unsigned int size)
+{
+       if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
+           !sk_rmem_schedule(sk, size)) {
+
+               if (tcp_prune_queue(sk) < 0)
+                       return -1;
+
+               if (!sk_rmem_schedule(sk, size)) {
+                       if (!tcp_prune_ofo_queue(sk))
+                               return -1;
+
+                       if (!sk_rmem_schedule(sk, size))
+                               return -1;
+               }
+       }
+       return 0;
+}
+
 static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
 {
        struct tcphdr *th = tcp_hdr(skb);
@@ -3861,12 +3925,9 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
                if (eaten <= 0) {
 queue_and_out:
                        if (eaten < 0 &&
-                           (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-                            !sk_rmem_schedule(sk, skb->truesize))) {
-                               if (tcp_prune_queue(sk) < 0 ||
-                                   !sk_rmem_schedule(sk, skb->truesize))
-                                       goto drop;
-                       }
+                           tcp_try_rmem_schedule(sk, skb->truesize))
+                               goto drop;
+
                        skb_set_owner_r(skb, sk);
                        __skb_queue_tail(&sk->sk_receive_queue, skb);
                }
@@ -3935,12 +3996,8 @@ drop:
 
        TCP_ECN_check_ce(tp, skb);
 
-       if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
-           !sk_rmem_schedule(sk, skb->truesize)) {
-               if (tcp_prune_queue(sk) < 0 ||
-                   !sk_rmem_schedule(sk, skb->truesize))
-                       goto drop;
-       }
+       if (tcp_try_rmem_schedule(sk, skb->truesize))
+               goto drop;
 
        /* Disable header prediction. */
        tp->pred_flags = 0;
@@ -3968,7 +4025,7 @@ drop:
                u32 end_seq = TCP_SKB_CB(skb)->end_seq;
 
                if (seq == TCP_SKB_CB(skb1)->end_seq) {
-                       __skb_append(skb1, skb, &tp->out_of_order_queue);
+                       __skb_queue_after(&tp->out_of_order_queue, skb1, skb);
 
                        if (!tp->rx_opt.num_sacks ||
                            tp->selective_acks[0].end_seq != seq)
@@ -4167,6 +4224,32 @@ static void tcp_collapse_ofo_queue(struct sock *sk)
        }
 }
 
+/*
+ * Purge the out-of-order queue.
+ * Return true if queue was pruned.
+ */
+static int tcp_prune_ofo_queue(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       int res = 0;
+
+       if (!skb_queue_empty(&tp->out_of_order_queue)) {
+               NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED);
+               __skb_queue_purge(&tp->out_of_order_queue);
+
+               /* Reset SACK state.  A conforming SACK implementation will
+                * do the same at a timeout based retransmit.  When a connection
+                * is in a sad state like this, we care only about integrity
+                * of the connection not performance.
+                */
+               if (tp->rx_opt.sack_ok)
+                       tcp_sack_reset(&tp->rx_opt);
+               sk_mem_reclaim(sk);
+               res = 1;
+       }
+       return res;
+}
+
 /* Reduce allocated memory if we can, trying to get
  * the socket within its memory limits again.
  *
@@ -4200,20 +4283,7 @@ static int tcp_prune_queue(struct sock *sk)
        /* Collapsing did not help, destructive actions follow.
         * This must not ever occur. */
 
-       /* First, purge the out_of_order queue. */
-       if (!skb_queue_empty(&tp->out_of_order_queue)) {
-               NET_INC_STATS_BH(LINUX_MIB_OFOPRUNED);
-               __skb_queue_purge(&tp->out_of_order_queue);
-
-               /* Reset SACK state.  A conforming SACK implementation will
-                * do the same at a timeout based retransmit.  When a connection
-                * is in a sad state like this, we care only about integrity
-                * of the connection not performance.
-                */
-               if (tcp_is_sack(tp))
-                       tcp_sack_reset(&tp->rx_opt);
-               sk_mem_reclaim(sk);
-       }
+       tcp_prune_ofo_queue(sk);
 
        if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
                return 0;
@@ -4451,6 +4521,49 @@ static void tcp_urg(struct sock *sk, struct sk_buff *skb, struct tcphdr *th)
        }
 }
 
+static int tcp_defer_accept_check(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+
+       if (tp->defer_tcp_accept.request) {
+               int queued_data =  tp->rcv_nxt - tp->copied_seq;
+               int hasfin =  !skb_queue_empty(&sk->sk_receive_queue) ?
+                       tcp_hdr((struct sk_buff *)
+                               sk->sk_receive_queue.prev)->fin : 0;
+
+               if (queued_data && hasfin)
+                       queued_data--;
+
+               if (queued_data &&
+                   tp->defer_tcp_accept.listen_sk->sk_state == TCP_LISTEN) {
+                       if (sock_flag(sk, SOCK_KEEPOPEN)) {
+                               inet_csk_reset_keepalive_timer(sk,
+                                                              keepalive_time_when(tp));
+                       } else {
+                               inet_csk_delete_keepalive_timer(sk);
+                       }
+
+                       inet_csk_reqsk_queue_add(
+                               tp->defer_tcp_accept.listen_sk,
+                               tp->defer_tcp_accept.request,
+                               sk);
+
+                       tp->defer_tcp_accept.listen_sk->sk_data_ready(
+                               tp->defer_tcp_accept.listen_sk, 0);
+
+                       sock_put(tp->defer_tcp_accept.listen_sk);
+                       sock_put(sk);
+                       tp->defer_tcp_accept.listen_sk = NULL;
+                       tp->defer_tcp_accept.request = NULL;
+               } else if (hasfin ||
+                          tp->defer_tcp_accept.listen_sk->sk_state != TCP_LISTEN) {
+                       tcp_reset(sk);
+                       return -1;
+               }
+       }
+       return 0;
+}
+
 static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen)
 {
        struct tcp_sock *tp = tcp_sk(sk);
@@ -4811,6 +4924,8 @@ step5:
 
        tcp_data_snd_check(sk);
        tcp_ack_snd_check(sk);
+
+       tcp_defer_accept_check(sk);
        return 0;
 
 csum_error:
@@ -5330,6 +5445,7 @@ discard:
 
 EXPORT_SYMBOL(sysctl_tcp_ecn);
 EXPORT_SYMBOL(sysctl_tcp_reordering);
+EXPORT_SYMBOL(sysctl_tcp_adv_win_scale);
 EXPORT_SYMBOL(tcp_parse_options);
 EXPORT_SYMBOL(tcp_rcv_established);
 EXPORT_SYMBOL(tcp_rcv_state_process);