tcp_dctcp.c 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345
  1. /* DataCenter TCP (DCTCP) congestion control.
  2. *
  3. * http://simula.stanford.edu/~alizade/Site/DCTCP.html
  4. *
  5. * This is an implementation of DCTCP over Reno, an enhancement to the
  6. * TCP congestion control algorithm designed for data centers. DCTCP
  7. * leverages Explicit Congestion Notification (ECN) in the network to
  8. * provide multi-bit feedback to the end hosts. DCTCP's goal is to meet
  9. * the following three data center transport requirements:
  10. *
  11. * - High burst tolerance (incast due to partition/aggregate)
  12. * - Low latency (short flows, queries)
  13. * - High throughput (continuous data updates, large file transfers)
  14. * with commodity shallow buffered switches
  15. *
  16. * The algorithm is described in detail in the following two papers:
  17. *
  18. * 1) Mohammad Alizadeh, Albert Greenberg, David A. Maltz, Jitendra Padhye,
  19. * Parveen Patel, Balaji Prabhakar, Sudipta Sengupta, and Murari Sridharan:
  20. * "Data Center TCP (DCTCP)", Data Center Networks session
  21. * Proc. ACM SIGCOMM, New Delhi, 2010.
  22. * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp-final.pdf
  23. *
  24. * 2) Mohammad Alizadeh, Adel Javanmard, and Balaji Prabhakar:
  25. * "Analysis of DCTCP: Stability, Convergence, and Fairness"
  26. * Proc. ACM SIGMETRICS, San Jose, 2011.
  27. * http://simula.stanford.edu/~alizade/Site/DCTCP_files/dctcp_analysis-full.pdf
  28. *
  29. * Initial prototype from Abdul Kabbani, Masato Yasuda and Mohammad Alizadeh.
  30. *
  31. * Authors:
  32. *
  33. * Daniel Borkmann <dborkman@redhat.com>
  34. * Florian Westphal <fw@strlen.de>
  35. * Glenn Judd <glenn.judd@morganstanley.com>
  36. *
  37. * This program is free software; you can redistribute it and/or modify
  38. * it under the terms of the GNU General Public License as published by
  39. * the Free Software Foundation; either version 2 of the License, or (at
  40. * your option) any later version.
  41. */
  42. #include <linux/module.h>
  43. #include <linux/mm.h>
  44. #include <net/tcp.h>
  45. #include <linux/inet_diag.h>
  46. #define DCTCP_MAX_ALPHA 1024U
/* Per-socket DCTCP state, kept in the socket's congestion-control
 * private area (size checked against ICSK_CA_PRIV_SIZE at init time).
 */
struct dctcp {
	u32 acked_bytes_ecn;	/* bytes ACKed with ECE set in current window */
	u32 acked_bytes_total;	/* all bytes ACKed in current window */
	u32 prior_snd_una;	/* snd_una at last per-ACK accounting */
	u32 prior_rcv_nxt;	/* rcv_nxt recorded at last CE state change */
	u32 dctcp_alpha;	/* congestion estimate, in [0, DCTCP_MAX_ALPHA] */
	u32 next_seq;		/* snd_nxt marking end of the observation window */
	u32 ce_state;		/* 0 = last pkt non-CE, 1 = last pkt CE */
	u32 delayed_ack_reserved; /* 0/1: delayed ACK currently outstanding */
	u32 loss_cwnd;		/* cwnd saved at ssthresh time, for undo */
};
/* EWMA gain: g = 1/2^dctcp_shift_g in the alpha update. */
static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */
module_param(dctcp_shift_g, uint, 0644);
MODULE_PARM_DESC(dctcp_shift_g, "parameter g for updating dctcp_alpha");

/* Starting alpha; defaults to maximum (assume full congestion). */
static unsigned int dctcp_alpha_on_init __read_mostly = DCTCP_MAX_ALPHA;
module_param(dctcp_alpha_on_init, uint, 0644);
MODULE_PARM_DESC(dctcp_alpha_on_init, "parameter for initial alpha value");

/* Optional extension: force alpha to max when entering TCP_CA_Loss. */
static unsigned int dctcp_clamp_alpha_on_loss __read_mostly;
module_param(dctcp_clamp_alpha_on_loss, uint, 0644);
MODULE_PARM_DESC(dctcp_clamp_alpha_on_loss,
		 "parameter for clamping alpha on loss");

/* Fallback ops used when the peer lacks ECN support; defined below. */
static struct tcp_congestion_ops dctcp_reno;
  69. static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
  70. {
  71. ca->next_seq = tp->snd_nxt;
  72. ca->acked_bytes_ecn = 0;
  73. ca->acked_bytes_total = 0;
  74. }
  75. static void dctcp_init(struct sock *sk)
  76. {
  77. const struct tcp_sock *tp = tcp_sk(sk);
  78. if ((tp->ecn_flags & TCP_ECN_OK) ||
  79. (sk->sk_state == TCP_LISTEN ||
  80. sk->sk_state == TCP_CLOSE)) {
  81. struct dctcp *ca = inet_csk_ca(sk);
  82. ca->prior_snd_una = tp->snd_una;
  83. ca->prior_rcv_nxt = tp->rcv_nxt;
  84. ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
  85. ca->delayed_ack_reserved = 0;
  86. ca->loss_cwnd = 0;
  87. ca->ce_state = 0;
  88. dctcp_reset(tp, ca);
  89. return;
  90. }
  91. /* No ECN support? Fall back to Reno. Also need to clear
  92. * ECT from sk since it is set during 3WHS for DCTCP.
  93. */
  94. inet_csk(sk)->icsk_ca_ops = &dctcp_reno;
  95. INET_ECN_dontxmit(sk);
  96. }
  97. static u32 dctcp_ssthresh(struct sock *sk)
  98. {
  99. struct dctcp *ca = inet_csk_ca(sk);
  100. struct tcp_sock *tp = tcp_sk(sk);
  101. ca->loss_cwnd = tp->snd_cwnd;
  102. return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U);
  103. }
/* Minimal DCTCP CE state machine:
 *
 * S:	0 <- last pkt was non-CE
 *	1 <- last pkt was CE
 */
/* Called on CA_EVENT_ECN_IS_CE: the incoming packet carried the CE
 * codepoint. On a 0->1 transition, any delayed ACK must go out first
 * (covering data up to prior_rcv_nxt) so it still reflects the old CE
 * state; statement order here is load-bearing.
 */
static void dctcp_ce_state_0_to_1(struct sock *sk)
{
	struct dctcp *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	if (!ca->ce_state) {
		/* State has changed from CE=0 to CE=1, force an immediate
		 * ACK to reflect the new CE state. If an ACK was delayed,
		 * send that first to reflect the prior CE state.
		 */
		if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
			__tcp_send_ack(sk, ca->prior_rcv_nxt);
		tcp_enter_quickack_mode(sk);
	}

	/* Record the boundary of the new CE state and start demanding
	 * CWR from the sender (sets ECE on outgoing ACKs).
	 */
	ca->prior_rcv_nxt = tp->rcv_nxt;
	ca->ce_state = 1;

	tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
}
/* Called on CA_EVENT_ECN_NO_CE: mirror of dctcp_ce_state_0_to_1() for
 * the 1->0 transition. A pending delayed ACK is flushed first so it
 * still reflects the prior (CE=1) state; statement order is load-bearing.
 */
static void dctcp_ce_state_1_to_0(struct sock *sk)
{
	struct dctcp *ca = inet_csk_ca(sk);
	struct tcp_sock *tp = tcp_sk(sk);

	if (ca->ce_state) {
		/* State has changed from CE=1 to CE=0, force an immediate
		 * ACK to reflect the new CE state. If an ACK was delayed,
		 * send that first to reflect the prior CE state.
		 */
		if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
			__tcp_send_ack(sk, ca->prior_rcv_nxt);
		tcp_enter_quickack_mode(sk);
	}

	/* Record the boundary of the new CE state and stop demanding CWR. */
	ca->prior_rcv_nxt = tp->rcv_nxt;
	ca->ce_state = 0;

	tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
}
/* in_ack_event hook: account ACKed bytes on every ACK and, once per
 * RTT-long window (when snd_una reaches next_seq), fold the fraction
 * of ECN-marked bytes F into the estimate:
 *
 *	alpha = (1 - g) * alpha + g * F,  g = 1/2^dctcp_shift_g
 */
static void dctcp_update_alpha(struct sock *sk, u32 flags)
{
	const struct tcp_sock *tp = tcp_sk(sk);
	struct dctcp *ca = inet_csk_ca(sk);
	u32 acked_bytes = tp->snd_una - ca->prior_snd_una;

	/* If ack did not advance snd_una, count dupack as MSS size.
	 * If ack did update window, do not count it at all.
	 */
	if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE))
		acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss;

	if (acked_bytes) {
		ca->acked_bytes_total += acked_bytes;
		ca->prior_snd_una = tp->snd_una;

		/* ECE on the ACK marks these bytes as CE-experienced. */
		if (flags & CA_ACK_ECE)
			ca->acked_bytes_ecn += acked_bytes;
	}

	/* Expired RTT */
	if (!before(tp->snd_una, ca->next_seq)) {
		u64 bytes_ecn = ca->acked_bytes_ecn;
		u32 alpha = ca->dctcp_alpha;

		/* alpha = (1 - g) * alpha + g * F */

		/* min_not_zero() lets alpha keep decaying (and reach 0)
		 * even once alpha >> shift_g truncates to zero.
		 */
		alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g);
		if (bytes_ecn) {
			/* If dctcp_shift_g == 1, a 32bit value would overflow
			 * after 8 Mbytes.
			 */
			bytes_ecn <<= (10 - dctcp_shift_g);
			do_div(bytes_ecn, max(1U, ca->acked_bytes_total));

			alpha = min(alpha + (u32)bytes_ecn, DCTCP_MAX_ALPHA);
		}
		/* dctcp_alpha can be read from dctcp_get_info() without
		 * synchro, so we ask compiler to not use dctcp_alpha
		 * as a temporary variable in prior operations.
		 */
		WRITE_ONCE(ca->dctcp_alpha, alpha);
		dctcp_reset(tp, ca);
	}
}
  181. static void dctcp_state(struct sock *sk, u8 new_state)
  182. {
  183. if (dctcp_clamp_alpha_on_loss && new_state == TCP_CA_Loss) {
  184. struct dctcp *ca = inet_csk_ca(sk);
  185. /* If this extension is enabled, we clamp dctcp_alpha to
  186. * max on packet loss; the motivation is that dctcp_alpha
  187. * is an indicator to the extend of congestion and packet
  188. * loss is an indicator of extreme congestion; setting
  189. * this in practice turned out to be beneficial, and
  190. * effectively assumes total congestion which reduces the
  191. * window by half.
  192. */
  193. ca->dctcp_alpha = DCTCP_MAX_ALPHA;
  194. }
  195. }
  196. static void dctcp_update_ack_reserved(struct sock *sk, enum tcp_ca_event ev)
  197. {
  198. struct dctcp *ca = inet_csk_ca(sk);
  199. switch (ev) {
  200. case CA_EVENT_DELAYED_ACK:
  201. if (!ca->delayed_ack_reserved)
  202. ca->delayed_ack_reserved = 1;
  203. break;
  204. case CA_EVENT_NON_DELAYED_ACK:
  205. if (ca->delayed_ack_reserved)
  206. ca->delayed_ack_reserved = 0;
  207. break;
  208. default:
  209. /* Don't care for the rest. */
  210. break;
  211. }
  212. }
  213. static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev)
  214. {
  215. switch (ev) {
  216. case CA_EVENT_ECN_IS_CE:
  217. dctcp_ce_state_0_to_1(sk);
  218. break;
  219. case CA_EVENT_ECN_NO_CE:
  220. dctcp_ce_state_1_to_0(sk);
  221. break;
  222. case CA_EVENT_DELAYED_ACK:
  223. case CA_EVENT_NON_DELAYED_ACK:
  224. dctcp_update_ack_reserved(sk, ev);
  225. break;
  226. default:
  227. /* Don't care for the rest. */
  228. break;
  229. }
  230. }
  231. static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
  232. union tcp_cc_info *info)
  233. {
  234. const struct dctcp *ca = inet_csk_ca(sk);
  235. /* Fill it also in case of VEGASINFO due to req struct limits.
  236. * We can still correctly retrieve it later.
  237. */
  238. if (ext & (1 << (INET_DIAG_DCTCPINFO - 1)) ||
  239. ext & (1 << (INET_DIAG_VEGASINFO - 1))) {
  240. memset(&info->dctcp, 0, sizeof(info->dctcp));
  241. if (inet_csk(sk)->icsk_ca_ops != &dctcp_reno) {
  242. info->dctcp.dctcp_enabled = 1;
  243. info->dctcp.dctcp_ce_state = (u16) ca->ce_state;
  244. info->dctcp.dctcp_alpha = ca->dctcp_alpha;
  245. info->dctcp.dctcp_ab_ecn = ca->acked_bytes_ecn;
  246. info->dctcp.dctcp_ab_tot = ca->acked_bytes_total;
  247. }
  248. *attr = INET_DIAG_DCTCPINFO;
  249. return sizeof(info->dctcp);
  250. }
  251. return 0;
  252. }
  253. static u32 dctcp_cwnd_undo(struct sock *sk)
  254. {
  255. const struct dctcp *ca = inet_csk_ca(sk);
  256. return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd);
  257. }
/* Primary DCTCP congestion-ops table. TCP_CONG_NEEDS_ECN requests
 * ECN negotiation during the 3WHS.
 */
static struct tcp_congestion_ops dctcp __read_mostly = {
	.init		= dctcp_init,
	.in_ack_event   = dctcp_update_alpha,
	.cwnd_event	= dctcp_cwnd_event,
	.ssthresh	= dctcp_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.undo_cwnd	= dctcp_cwnd_undo,
	.set_state	= dctcp_state,
	.get_info	= dctcp_get_info,
	.flags		= TCP_CONG_NEEDS_ECN,
	.owner		= THIS_MODULE,
	.name		= "dctcp",
};
/* Fallback ops installed by dctcp_init() when the peer did not
 * negotiate ECN: plain Reno behavior, but keeps dctcp_get_info()
 * so diag reports dctcp_enabled = 0.
 */
static struct tcp_congestion_ops dctcp_reno __read_mostly = {
	.ssthresh	= tcp_reno_ssthresh,
	.cong_avoid	= tcp_reno_cong_avoid,
	.get_info	= dctcp_get_info,
	.owner		= THIS_MODULE,
	.name		= "dctcp-reno",
};
/* Module init: verify our private state fits the per-socket CA area,
 * then register the DCTCP ops with the TCP stack.
 */
static int __init dctcp_register(void)
{
	BUILD_BUG_ON(sizeof(struct dctcp) > ICSK_CA_PRIV_SIZE);
	return tcp_register_congestion_control(&dctcp);
}
/* Module exit: remove the DCTCP ops from the TCP stack. */
static void __exit dctcp_unregister(void)
{
	tcp_unregister_congestion_control(&dctcp);
}
/* Module entry/exit points and metadata. */
module_init(dctcp_register);
module_exit(dctcp_unregister);

MODULE_AUTHOR("Daniel Borkmann <dborkman@redhat.com>");
MODULE_AUTHOR("Florian Westphal <fw@strlen.de>");
MODULE_AUTHOR("Glenn Judd <glenn.judd@morganstanley.com>");

MODULE_LICENSE("GPL v2");
MODULE_DESCRIPTION("DataCenter TCP (DCTCP)");