int sysctl_tcp_rfc1337 __read_mostly;
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
int sysctl_tcp_frto __read_mostly;
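+/* Selects the response to a spurious RTO (see the switch in
+ * tcp_process_frto() below): 0 (default) rate halving, 1 the very
+ * conservative response, 2 undo of the cwnd reduction.
+ */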
+int sysctl_tcp_frto_response __read_mostly;
int sysctl_tcp_nometrics_save __read_mostly;
int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
#define FLAG_ECE 0x40 /* ECE in this ACK */
#define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */
#define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/
+#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */
#define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
#define IsFack(tp) ((tp)->rx_opt.sack_ok & 2)
#define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4)
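+/* sysctl_tcp_frto == 2 selects the SACK enhanced F-RTO variant
+ * (RFC4138; Appendix B); other non-zero values select the basic version.
+ */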
+#define IsSackFrto() (sysctl_tcp_frto == 0x2)
+
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
/* Adapt the MSS value used to make delayed ack decision to the
}
/* Set slow start threshold and cwnd without falling back to slow start */
-void tcp_enter_cwr(struct sock *sk)
+void tcp_enter_cwr(struct sock *sk, const int set_ssthresh)
{
struct tcp_sock *tp = tcp_sk(sk);
+ const struct inet_connection_sock *icsk = inet_csk(sk);
tp->prior_ssthresh = 0;
tp->bytes_acked = 0;
- if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) {
+ if (icsk->icsk_ca_state < TCP_CA_CWR) {
tp->undo_marker = 0;
- tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk);
+ if (set_ssthresh)
+ tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
tp->snd_cwnd = min(tp->snd_cwnd,
tcp_packets_in_flight(tp) + 1U);
tp->snd_cwnd_cnt = 0;
/* clear lost hint */
tp->retransmit_skb_hint = NULL;
}
+ /* SACK enhanced F-RTO detection:
+ * set the flag if and only if non-rexmitted
+ * segments below frto_highmark are
+ * SACKed (RFC4138; Appendix B).
+ * Clearing the flag is correct due to
+ * the in-order walk.
+ */
+ if (after(end_seq, tp->frto_highmark)) {
+ flag &= ~FLAG_ONLY_ORIG_SACKED;
+ } else {
+ if (!(sacked & TCPCB_RETRANS))
+ flag |= FLAG_ONLY_ORIG_SACKED;
+ }
}
TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED;
tp->left_out = tp->sacked_out + tp->lost_out;
- if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss)
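+ /* Skip the reordering adjustment while an F-RTO round is still
+ * pending (snd_una has not passed frto_highmark); the SACK state
+ * is ambiguous while the RTO retransmission is outstanding.
+ */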
+ if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss &&
+ (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark)))
tcp_update_reordering(sk, ((tp->fackets_out + 1) - reord), 0);
#if FASTRETRANS_DEBUG > 0
/* F-RTO can only be used if these conditions are satisfied:
* - there must be some unsent new data
* - the advertised window should allow sending it
- * - TCP has never retransmitted anything other than head
+ * - TCP has never retransmitted anything other than head (SACK enhanced
+ * variant from Appendix B of RFC4138 is more robust here)
*/
int tcp_use_frto(struct sock *sk)
{
tp->snd_una + tp->snd_wnd))
return 0;
+ if (IsSackFrto())
+ return 1;
+
/* Avoid expensive walking of rexmit queue if possible */
if (tp->retrans_out > 1)
return 0;
/* RTO occurred, but do not yet enter Loss state. Instead, defer RTO
* recovery a bit and use heuristics in tcp_process_frto() to detect if
- * the RTO was spurious.
+ * the RTO was spurious. Only clear SACKED_RETRANS of the head here to
+ * keep the retrans_out counting accurate (with SACK F-RTO, segments
+ * other than the head may still have that bit set); TCPCB_LOST and the
+ * remaining SACKED_RETRANS bits are handled when the Loss state is
+ * really to be entered (in tcp_enter_frto_loss).
*
* Do like tcp_enter_loss() would; when RTO expires the second time it
* does:
((icsk->icsk_ca_state == TCP_CA_Loss || tp->frto_counter) &&
!icsk->icsk_retransmits)) {
tp->prior_ssthresh = tcp_current_ssthresh(sk);
- tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+ /* Our state is too optimistic in the ssthresh() call when a
+ * previous F-RTO recovery has not yet completed, because cwnd
+ * is not reduced until tcp_enter_frto_loss(). The pattern would
+ * be: RTO, cumulative ACK, RTO (a second RTO for the same
+ * segment does not end up here twice). Temporarily forcing cwnd
+ * to 2 below makes cwnd-based ssthresh() implementations return
+ * their conservative minimum instead of an inflated value.
+ * RFC4138 should be more specific on what to do here, even
+ * though an RTO is quite unlikely to occur after the first
+ * cumulative ACK due to back-off and the complexity of the
+ * triggering events ...
+ */
+ if (tp->frto_counter) {
+ u32 stored_cwnd;
+ stored_cwnd = tp->snd_cwnd;
+ tp->snd_cwnd = 2;
+ tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+ tp->snd_cwnd = stored_cwnd;
+ } else {
+ tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+ }
+ /* ... in theory, the congestion control module could do "any
+ * tricks" in ssthresh(), which means that ca_state, lost bits
+ * and lost_out counter would have to be faked before the call
+ * occurs. We consider that too expensive, unlikely and hacky,
+ * so modules using these in ssthresh() must deal with these
+ * incompatibility issues if they receive CA_EVENT_FRTO and
+ * frto_counter != 0.
+ */
tcp_ca_event(sk, CA_EVENT_FRTO);
}
- /* Have to clear retransmission markers here to keep the bookkeeping
- * in shape, even though we are not yet in Loss state.
- * If something was really lost, it is eventually caught up
- * in tcp_enter_frto_loss.
- */
- tp->retrans_out = 0;
tp->undo_marker = tp->snd_una;
tp->undo_retrans = 0;
- sk_stream_for_retrans_queue(skb, sk) {
+ skb = skb_peek(&sk->sk_write_queue);
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
+ tp->retrans_out -= tcp_skb_pcount(skb);
}
tcp_sync_left_out(tp);
+ /* Earlier loss recovery is underway (see RFC4138; Appendix B).
+ * The last condition is necessary at least in the
+ * tp->frto_counter case.
+ */
+ if (IsSackFrto() && (tp->frto_counter ||
+ ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) &&
+ after(tp->high_seq, tp->snd_una)) {
+ tp->frto_highmark = tp->high_seq;
+ } else {
+ tp->frto_highmark = tp->snd_nxt;
+ }
tcp_set_ca_state(sk, TCP_CA_Disorder);
tp->high_seq = tp->snd_nxt;
- tp->frto_highmark = tp->snd_nxt;
tp->frto_counter = 1;
}
* which indicates that we should follow the traditional RTO recovery,
* i.e. mark everything lost and do go-back-N retransmission.
*/
-static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments)
+static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
tp->sacked_out = 0;
tp->lost_out = 0;
tp->fackets_out = 0;
+ tp->retrans_out = 0;
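+ /* retrans_out is rebuilt in the walk below; only the RTO
+ * retransmission of the head may remain counted.
+ */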
sk_stream_for_retrans_queue(skb, sk) {
cnt += tcp_skb_pcount(skb);
- TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
+ /*
+ * Count the retransmission made on RTO correctly (only when we
+ * were waiting for the first ACK and did not get it)...
+ */
+ if ((tp->frto_counter == 1) && !(flag&FLAG_DATA_ACKED)) {
+ tp->retrans_out += tcp_skb_pcount(skb);
+ /* ...enter this if branch just for the first segment */
+ flag |= FLAG_DATA_ACKED;
+ } else {
+ TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
+ }
if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
/* Do not mark those segments lost that were
tp->retrans_stamp = 0;
if (flag&FLAG_ECE)
- tcp_enter_cwr(sk);
+ tcp_enter_cwr(sk, 1);
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
int state = TCP_CA_Open;
tcp_moderate_cwnd(tp);
}
+/* A conservative spurious RTO response algorithm: reduce cwnd using
+ * rate halving and continue in congestion avoidance.
+ */
+static void tcp_ratehalving_spur_to_response(struct sock *sk)
+{
+ tcp_enter_cwr(sk, 0);
+}
+
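+/* Undo the cwnd reduction after a spurious RTO, unless the ACK also
+ * carried an ECN-Echo: congestion was then signalled, so fall back to
+ * rate halving instead of undoing.
+ */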
+static void tcp_undo_spur_to_response(struct sock *sk, int flag)
+{
+ if (flag&FLAG_ECE)
+ tcp_ratehalving_spur_to_response(sk);
+ else
+ tcp_undo_cwr(sk, 1);
+}
+
/* F-RTO spurious RTO detection algorithm (RFC4138)
*
* F-RTO is in effect during the two new ACKs following the RTO (well, almost; see inline
* Rationale: if the RTO was spurious, new ACKs should arrive from the
* original window even after we transmit two new data segments.
*
+ * SACK version:
+ * in the first step, wait until the first cumulative ACK arrives, then
+ * move to the second step. In the second step, the next ACK decides.
+ *
* F-RTO is implemented (mainly) in four functions:
* - tcp_use_frto() is used to determine if TCP can use F-RTO
* - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is
inet_csk(sk)->icsk_retransmits = 0;
if (!before(tp->snd_una, tp->frto_highmark)) {
- tcp_enter_frto_loss(sk, tp->frto_counter + 1);
+ tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag);
return 1;
}
- /* RFC4138 shortcoming in step 2; should also have case c): ACK isn't
- * duplicate nor advances window, e.g., opposite dir data, winupdate
- */
- if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) &&
- !(flag&FLAG_FORWARD_PROGRESS))
- return 1;
+ if (!IsSackFrto() || IsReno(tp)) {
+ /* RFC4138 shortcoming in step 2; it should also have case c):
+ * the ACK is neither a duplicate nor advances the window, e.g.,
+ * opposite-direction data or a window update
+ */
+ if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) &&
+ !(flag&FLAG_FORWARD_PROGRESS))
+ return 1;
- if (!(flag&FLAG_DATA_ACKED)) {
- tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3));
- return 1;
+ if (!(flag&FLAG_DATA_ACKED)) {
+ tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3),
+ flag);
+ return 1;
+ }
+ } else {
+ if (!(flag&FLAG_DATA_ACKED) && (tp->frto_counter == 1)) {
+ /* Prevent sending of new data. */
+ tp->snd_cwnd = min(tp->snd_cwnd,
+ tcp_packets_in_flight(tp));
+ return 1;
+ }
+
+ if ((tp->frto_counter == 2) &&
+ (!(flag&FLAG_FORWARD_PROGRESS) ||
+ ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) {
+ /* RFC4138 shortcoming (see comment above) */
+ if (!(flag&FLAG_FORWARD_PROGRESS) && (flag&FLAG_NOT_DUP))
+ return 1;
+
+ tcp_enter_frto_loss(sk, 3, flag);
+ return 1;
+ }
}
if (tp->frto_counter == 1) {
tp->frto_counter = 2;
return 1;
} else /* frto_counter == 2 */ {
- tcp_conservative_spur_to_response(tp);
+ switch (sysctl_tcp_frto_response) {
+ case 2:
+ tcp_undo_spur_to_response(sk, flag);
+ break;
+ case 1:
+ tcp_conservative_spur_to_response(tp);
+ break;
+ default:
+ tcp_ratehalving_spur_to_response(sk);
+ break;
+ }
tp->frto_counter = 0;
}
return 0;
static inline void tcp_store_ts_recent(struct tcp_sock *tp)
{
tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval;
- tp->rx_opt.ts_recent_stamp = xtime.tv_sec;
+ tp->rx_opt.ts_recent_stamp = get_seconds();
}
static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq)
*/
if((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 ||
- xtime.tv_sec >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS)
+ get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS)
tcp_store_ts_recent(tp);
}
}
{
const struct tcp_sock *tp = tcp_sk(sk);
return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW &&
- xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
+ get_seconds() < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS &&
!tcp_disordered_ack(sk, skb));
}