int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly;
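+/* Selects the response to a spurious RTO (see the switch in
+ * tcp_process_frto() below): 0 (default) rate halving, 1 the very
+ * conservative response, 2 undo of the cwnd reduction.
+ */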
+int sysctl_tcp_frto_response __read_mostly;
int sysctl_tcp_nometrics_save __read_mostly;
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
#define FLAG_ECE 0x40 /* ECE in this ACK */
#define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */
#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
+#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define IsFack(tp) ((tp)->rx_opt.sack_ok & 2)
#define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4)
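+/* sysctl_tcp_frto == 2 selects the SACK enhanced F-RTO variant
+ * (RFC4138; Appendix B); other non-zero values select the basic version.
+ */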
+#define IsSackFrto() (sysctl_tcp_frto == 0x2)
+
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
/* Adapt the MSS value used to make delayed ack decision to the
}
/* Set slow start threshold and cwnd without falling back to slow start */
-void tcp_enter_cwr(struct sock *sk)
+void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
{
struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
tp->prior_ssthresh = 0;
tp->bytes_acked = 0;
- if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+ if (icsk->icsk_ca_state < TCP_CA_CWR) {
tp->undo_marker = 0;
- tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+ if (set_ssthresh)
+ tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tp->snd_cwnd = min(tp->snd_cwnd,
tcp_packets_in_flight(tp) + 1U);
tp->snd_cwnd_cnt = 0;
/* clear lost hint */
tp->retransmit_skb_hint = NULL;
}
+ /* SACK enhanced F-RTO detection:
+ * set the flag if and only if non-rexmitted
+ * segments below frto_highmark are
+ * SACKed (RFC4138; Appendix B).
+ * Clearing the flag is correct due to
+ * the in-order walk.
+ */
+ if (after(end_seq, tp->frto_highmark)) {
+ flag &= ~FLAG_ONLY_ORIG_SACKED;
+ } else {
+ if (!(sacked & TCPCB_RETRANS))
+ flag |= FLAG_ONLY_ORIG_SACKED;
+ }
}
TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
tp->left_out = tp->sacked_out + tp->lost_out;
- if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss)
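+ /* Skip the reordering adjustment while an F-RTO round is still
+ * pending (snd_una has not passed frto_highmark); the SACK state
+ * is ambiguous while the RTO retransmission is outstanding.
+ */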
+ if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss &&
+ (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0);
#if FASTRETRANS_DEBUG > 0
/* F-RTO can only be used if these conditions are satisfied:
* - there must be some unsent new data
* - the advertised window should allow sending it
- * - TCP has never retransmitted anything other than head
+ * - TCP has never retransmitted anything other than head (SACK enhanced
+ * variant from Appendix B of RFC4138 is more robust here)
*/
int tcp_use_frto(struct sock *sk)
{
tp->snd_una + tp->snd_wnd))
return 0;
+ if (IsSackFrto())
+ return 1;
+
/* Avoid expensive walking of rexmit queue if possible */
if (tp->retrans_out > 1)
return 0;
/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
* recovery a bit and use heuristics in tcp_process_frto() to detect if
- * the RTO was spurious.
+ * the RTO was spurious. Only clear SACKED_RETRANS of the head here to
+ * keep the retrans_out counting accurate (with SACK F-RTO, segments
+ * other than the head may still have that bit set); TCPCB_LOST and the
+ * remaining SACKED_RETRANS bits are handled when the Loss state is
+ * really to be entered (in tcp_enter_frto_loss).
*
* Do like tcp_enter_loss() would; when RTO expires the second time it
* does:
((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) &&
!icsk->icsk_retransmits)) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
- tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+ /* Our state is too optimistic in the ssthresh() call when a
+ * previous F-RTO recovery has not yet completed, because cwnd
+ * is not reduced until tcp_enter_frto_loss(). The pattern would
+ * be: RTO, cumulative ACK, RTO (a second RTO for the same
+ * segment does not end up here twice). Temporarily forcing cwnd
+ * to 2 below makes cwnd-based ssthresh() implementations return
+ * their conservative minimum instead of an inflated value.
+ * RFC4138 should be more specific on what to do here, even
+ * though an RTO is quite unlikely to occur after the first
+ * cumulative ACK due to back-off and the complexity of the
+ * triggering events ...
+ */
+ if (tp->frto_counter) {
+ u32 stored_cwnd;
+ stored_cwnd = tp->snd_cwnd;
+ tp->snd_cwnd = 2;
+ tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+ tp->snd_cwnd = stored_cwnd;
+ } else {
+ tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+ }
+ /* ... in theory, the congestion control module could do "any
+ * tricks" in ssthresh(), which means that ca_state, lost bits
+ * and lost_out counter would have to be faked before the call
+ * occurs. We consider that too expensive, unlikely and hacky,
+ * so modules using these in ssthresh() must deal with these
+ * incompatibility issues if they receive CA_EVENT_FRTO and
+ * frto_counter != 0.
+ */
tcp_ca_event(sk, CA_EVENT_FRTO);
}
- /* Have to clear retransmission markers here to keep the bookkeeping
- * in shape, even though we are not yet in Loss state.
- * If something was really lost, it is eventually caught up
- * in tcp_enter_frto_loss.
- */
- tp->retrans_out = 0;
tp->undo_marker = tp->snd_una;
tp->undo_retrans = 0;
- sk_stream_for_retrans_queue(skb, sk) {
+ skb = skb_peek(&sk->sk_write_queue);
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
}
tcp_sync_left_out(tp);
+ /* Earlier loss recovery is underway (see RFC4138; Appendix B).
+ * The last condition is necessary at least in the
+ * tp->frto_counter case.
+ */
+ if (IsSackFrto() && (tp->frto_counter ||
+ ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) &&
+ after(tp->high_seq, tp->snd_una)) {
+ tp->frto_highmark = tp->high_seq;
+ } else {
+ tp->frto_highmark = tp->snd_nxt;
+ }
tcp_set_ca_state(sk, TCP_CA_Disorder);
tp->high_seq = tp->snd_nxt;
- tp->frto_highmark = tp->snd_nxt;
tp->frto_counter = 1;
}
* which indicates that we should follow the traditional RTO recovery,
* i.e. mark everything lost and do go-back-N retransmission.
*/
-static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments)
+static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
tp->sacked_out = 0;
tp->lost_out = 0;
tp->fackets_out = 0;
+ tp->retrans_out = 0;
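+ /* retrans_out is rebuilt in the walk below; only the RTO
+ * retransmission of the head may remain counted.
+ */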
sk_stream_for_retrans_queue(skb, sk) {
cnt += tcp_skb_pcount(skb);
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
+ /*
+ * Count the retransmission made on RTO correctly (only when we
+ * were waiting for the first ACK and did not get it)...
+ */
+ if ((tp->frto_counter == 1) && !(flag&FLAG_DATA_ACKED)) {
+ tp->retrans_out += tcp_skb_pcount(skb);
+ /* ...enter this if branch just for the first segment */
+ flag |= FLAG_DATA_ACKED;
+ } else {
+ TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
+ }
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
/* Do not mark those segments lost that were
tp->retrans_stamp = 0;
if (flag&FLAG_ECE)
- tcp_enter_cwr(sk);
+ tcp_enter_cwr(sk, 1);
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
int state = TCP_CA_Open;
tcp_moderate_cwnd(tp);
}
+/* A conservative spurious RTO response algorithm: reduce cwnd using
+ * rate halving and continue in congestion avoidance.
+ */
+static void tcp_ratehalving_spur_to_response(struct sock *sk)
+{
+ tcp_enter_cwr(sk, 0);
+}
+
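+/* Undo the cwnd reduction after a spurious RTO, unless the ACK also
+ * carried an ECN-Echo: congestion was then signalled, so fall back to
+ * rate halving instead of undoing.
+ */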
+static void tcp_undo_spur_to_response(struct sock *sk, int flag)
+{
+ if (flag&FLAG_ECE)
+ tcp_ratehalving_spur_to_response(sk);
+ else
+ tcp_undo_cwr(sk, 1);
+}
+
/* F-RTO spurious RTO detection algorithm (RFC4138)
*
* F-RTO is in effect during the two new ACKs following the RTO (well, almost; see inline
* Rationale: if the RTO was spurious, new ACKs should arrive from the
* original window even after we transmit two new data segments.
*
+ * SACK version:
+ * in the first step, wait until the first cumulative ACK arrives, then
+ * move to the second step. In the second step, the next ACK decides.
+ *
* F-RTO is implemented (mainly) in four functions:
* - tcp_use_frto() is used to determine if TCP can use F-RTO
* - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is
inet_csk(sk)->icsk_retransmits = 0;
if (!before(tp->snd_una, tp->frto_highmark)) {
- tcp_enter_frto_loss(sk, tp->frto_counter + 1);
+ tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag);
return 1;
}
- /* RFC4138 shortcoming in step 2; should also have case c): ACK isn't
- * duplicate nor advances window, e.g., opposite dir data, winupdate
- */
- if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) &&
- !(flag&FLAG_FORWARD_PROGRESS))
- return 1;
+ if (!IsSackFrto() || IsReno(tp)) {
+ /* RFC4138 shortcoming in step 2; it should also have case c):
+ * the ACK is neither a duplicate nor advances the window, e.g.,
+ * opposite-direction data or a window update
+ */
+ if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) &&
+ !(flag&FLAG_FORWARD_PROGRESS))
+ return 1;
- if (!(flag&FLAG_DATA_ACKED)) {
- tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3));
- return 1;
+ if (!(flag&FLAG_DATA_ACKED)) {
+ tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
+ flag);
+ return 1;
+ }
+ } else {
+ if (!(flag&FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
+ /* Prevent sending of new data. */
+ tp->snd_cwnd = min(tp->snd_cwnd,
+ tcp_packets_in_flight(tp));
+ return 1;
+ }
+
+ if ((tp->frto_counter == 2) &&
+ (!(flag&FLAG_FORWARD_PROGRESS) ||
+ ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) {
+ /* RFC4138 shortcoming (see comment above) */
+ if (!(flag&FLAG_FORWARD_PROGRESS) && (flag&FLAG_NOT_DUP))
+ return 1;
+
+ tcp_enter_frto_loss(sk, 3, flag);
+ return 1;
+ }
}
if (tp->frto_counter == 1) {
tp->frto_counter = 2;
return 1;
} else /* frto_counter == 2 */ {
- tcp_conservative_spur_to_response(tp);
+ switch (sysctl_tcp_frto_response) {
+ case 2:
+ tcp_undo_spur_to_response(sk, flag);
+ break;
+ case 1:
+ tcp_conservative_spur_to_response(tp);
+ break;
+ default:
+ tcp_ratehalving_spur_to_response(sk);
+ break;
+ }
tp->frto_counter = 0;
}
return 0;
static inline void tcp_store_ts_recent(struct tcp_sock *tp)
{
tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
- tp->rx_opt.ts_recent_stamp = xtime.tv_sec;
+ tp->rx_opt.ts_recent_stamp = get_seconds();
}
static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
*/
if((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 ||
- xtime.tv_sec >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS)
+ get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS)
tcp_store_ts_recent(tp);
}
}
{
const struct tcp_sock *tp = tcp_sk(sk);
return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
- xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
+ get_seconds() < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
!tcp_disordered_ack(sk, skb));
}