/* tcp_sack.c — TCP SACK (Selective Acknowledgment) processing. */
  1. /*-
  2. * SPDX-License-Identifier: BSD-3-Clause
  3. *
  4. * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
  5. * The Regents of the University of California.
  6. * All rights reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. * 1. Redistributions of source code must retain the above copyright
  12. * notice, this list of conditions and the following disclaimer.
  13. * 2. Redistributions in binary form must reproduce the above copyright
  14. * notice, this list of conditions and the following disclaimer in the
  15. * documentation and/or other materials provided with the distribution.
  16. * 3. Neither the name of the University nor the names of its contributors
  17. * may be used to endorse or promote products derived from this software
  18. * without specific prior written permission.
  19. *
  20. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  21. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  24. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  25. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  26. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  27. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  28. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  29. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  30. * SUCH DAMAGE.
  31. */
  32. /*-
  33. *
  34. * NRL grants permission for redistribution and use in source and binary
  35. * forms, with or without modification, of the software and documentation
  36. * created at NRL provided that the following conditions are met:
  37. *
  38. * 1. Redistributions of source code must retain the above copyright
  39. * notice, this list of conditions and the following disclaimer.
  40. * 2. Redistributions in binary form must reproduce the above copyright
  41. * notice, this list of conditions and the following disclaimer in the
  42. * documentation and/or other materials provided with the distribution.
  43. * 3. All advertising materials mentioning features or use of this software
  44. * must display the following acknowledgements:
  45. * This product includes software developed by the University of
  46. * California, Berkeley and its contributors.
  47. * This product includes software developed at the Information
  48. * Technology Division, US Naval Research Laboratory.
  49. * 4. Neither the name of the NRL nor the names of its contributors
  50. * may be used to endorse or promote products derived from this software
  51. * without specific prior written permission.
  52. *
  53. * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
  54. * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
  55. * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
  56. * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
  57. * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
  58. * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
  59. * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
  60. * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
  61. * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  62. * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
  63. * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  64. *
  65. * The views and conclusions contained in the software and documentation
  66. * are those of the authors and should not be interpreted as representing
  67. * official policies, either expressed or implied, of the US Naval
  68. * Research Laboratory (NRL).
  69. */
  70. #include <sys/cdefs.h>
  71. #include "opt_inet.h"
  72. #include "opt_inet6.h"
  73. #include <sys/param.h>
  74. #include <sys/systm.h>
  75. #include <sys/kernel.h>
  76. #include <sys/sysctl.h>
  77. #include <sys/malloc.h>
  78. #include <sys/mbuf.h>
  79. #include <sys/proc.h> /* for proc0 declaration */
  80. #include <sys/protosw.h>
  81. #include <sys/socket.h>
  82. #include <sys/socketvar.h>
  83. #include <sys/syslog.h>
  84. #include <sys/systm.h>
  85. #include <machine/cpu.h> /* before tcp_seq.h, for tcp_random18() */
  86. #include <vm/uma.h>
  87. #include <net/if.h>
  88. #include <net/if_var.h>
  89. #include <net/route.h>
  90. #include <net/vnet.h>
  91. #include <netinet/in.h>
  92. #include <netinet/in_systm.h>
  93. #include <netinet/ip.h>
  94. #include <netinet/in_var.h>
  95. #include <netinet/in_pcb.h>
  96. #include <netinet/ip_var.h>
  97. #include <netinet/ip6.h>
  98. #include <netinet/icmp6.h>
  99. #include <netinet6/nd6.h>
  100. #include <netinet6/ip6_var.h>
  101. #include <netinet6/in6_pcb.h>
  102. #include <netinet/tcp.h>
  103. #include <netinet/tcp_fsm.h>
  104. #include <netinet/tcp_seq.h>
  105. #include <netinet/tcp_timer.h>
  106. #include <netinet/tcp_var.h>
  107. #include <netinet/tcpip.h>
  108. #include <netinet/cc/cc.h>
  109. #include <machine/in_cksum.h>
/* Per-VNET UMA zone from which struct sackhole entries are allocated. */
VNET_DECLARE(struct uma_zone *, sack_hole_zone);
#define V_sack_hole_zone VNET(sack_hole_zone)

/* sysctl subtree: net.inet.tcp.sack.* */
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "TCP SACK");

/* Master switch for negotiating/using SACK (RFC 2018); on by default. */
VNET_DEFINE(int, tcp_do_sack) = 1;
SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_sack), 0,
    "Enable/Disable TCP SACK support");

/* Use the RFC 6675 revised loss-recovery algorithm; on by default. */
VNET_DEFINE(int, tcp_do_newsack) = 1;
SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, revised, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_newsack), 0,
    "Use revised SACK loss recovery per RFC 6675");

/* Lost Retransmission Detection: re-retransmit when a rexmit is lost. */
VNET_DEFINE(int, tcp_do_lrd) = 1;
SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, lrd, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_do_lrd), 1,
    "Perform Lost Retransmission Detection");

/* Allow TSO bursts while in SACK loss recovery; off by default. */
VNET_DEFINE(int, tcp_sack_tso) = 0;
SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, tso, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_sack_tso), 0,
    "Allow TSO during SACK loss recovery");

/* Per-connection cap on scoreboard holes (bounds memory per tcpcb). */
VNET_DEFINE(int, tcp_sack_maxholes) = 128;
SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_sack_maxholes), 0,
    "Maximum number of TCP SACK holes allowed per connection");

/* System-wide cap on scoreboard holes across all connections. */
VNET_DEFINE(int, tcp_sack_globalmaxholes) = 65536;
SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalmaxholes, CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(tcp_sack_globalmaxholes), 0,
    "Global maximum number of TCP SACK holes");

/* Read-only counter of holes currently allocated (updated atomically). */
VNET_DEFINE(int, tcp_sack_globalholes) = 0;
SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, globalholes, CTLFLAG_VNET | CTLFLAG_RD,
    &VNET_NAME(tcp_sack_globalholes), 0,
    "Global number of TCP SACK holes currently allocated");
  142. int
  143. tcp_dsack_block_exists(struct tcpcb *tp)
  144. {
  145. /* Return true if a DSACK block exists */
  146. if (tp->rcv_numsacks == 0)
  147. return (0);
  148. if (SEQ_LEQ(tp->sackblks[0].end, tp->rcv_nxt))
  149. return(1);
  150. return (0);
  151. }
/*
 * This function will find overlaps with the currently stored sackblocks
 * and add any overlap as a dsack block upfront
 */
void
tcp_update_dsack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
{
	struct sackblk head_blk,mid_blk,saved_blks[MAX_SACK_BLKS];
	int i, j, n, identical;
	tcp_seq start, end;

	INP_WLOCK_ASSERT(tptoinpcb(tp));
	KASSERT(SEQ_LT(rcv_start, rcv_end), ("rcv_start < rcv_end"));

	/*
	 * A segment lying entirely below rcv_nxt (or ending exactly at
	 * rcv_nxt while the first stored block also ends there) is a full
	 * duplicate: report it verbatim as the D-SACK block in slot 0.
	 * Otherwise start with an empty slot 0.
	 */
	if (SEQ_LT(rcv_end, tp->rcv_nxt) ||
	    ((rcv_end == tp->rcv_nxt) &&
	    (tp->rcv_numsacks > 0 ) &&
	    (tp->sackblks[0].end == tp->rcv_nxt))) {
		saved_blks[0].start = rcv_start;
		saved_blks[0].end = rcv_end;
	} else {
		saved_blks[0].start = saved_blks[0].end = 0;
	}

	/*
	 * head_blk accumulates the duplicate (overlap) range; mid_blk
	 * grows into the union of the new segment with every stored block
	 * it touches. "identical" is set when a stored block exactly
	 * matches that union.
	 */
	head_blk.start = head_blk.end = 0;
	mid_blk.start = rcv_start;
	mid_blk.end = rcv_end;
	identical = 0;
	for (i = 0; i < tp->rcv_numsacks; i++) {
		start = tp->sackblks[i].start;
		end = tp->sackblks[i].end;
		if (SEQ_LT(rcv_end, start)) {
			/* pkt left to sack blk */
			continue;
		}
		if (SEQ_GT(rcv_start, end)) {
			/* pkt right to sack blk */
			continue;
		}
		if (SEQ_GT(tp->rcv_nxt, end)) {
			/*
			 * Stored block is already fully ACKed; any overlap
			 * with the new segment is duplicate data. Keep the
			 * lowest non-empty such overlap in head_blk.
			 */
			if ((SEQ_MAX(rcv_start, start) != SEQ_MIN(rcv_end, end)) &&
			    (SEQ_GT(head_blk.start, SEQ_MAX(rcv_start, start)) ||
			    (head_blk.start == head_blk.end))) {
				head_blk.start = SEQ_MAX(rcv_start, start);
				head_blk.end = SEQ_MIN(rcv_end, end);
			}
			continue;
		}
		/*
		 * Stored block reaches above rcv_nxt and overlaps the new
		 * segment: the stored block itself is the duplicate range
		 * (take the lowest-starting one).
		 */
		if (((head_blk.start == head_blk.end) ||
		    SEQ_LT(start, head_blk.start)) &&
		    (SEQ_GT(end, rcv_start) &&
		    SEQ_LEQ(start, rcv_end))) {
			head_blk.start = start;
			head_blk.end = end;
		}
		/* Grow mid_blk to cover this overlapping stored block. */
		mid_blk.start = SEQ_MIN(mid_blk.start, start);
		mid_blk.end = SEQ_MAX(mid_blk.end, end);
		if ((mid_blk.start == start) &&
		    (mid_blk.end == end))
			identical = 1;
	}
	if (SEQ_LT(head_blk.start, head_blk.end)) {
		/* store overlapping range */
		saved_blks[0].start = SEQ_MAX(rcv_start, head_blk.start);
		saved_blks[0].end = SEQ_MIN(rcv_end, head_blk.end);
	}
	n = 1;
	/*
	 * Second, if not ACKed, store the SACK block that
	 * overlaps with the DSACK block unless it is identical
	 */
	if ((SEQ_LT(tp->rcv_nxt, mid_blk.end) &&
	    !((mid_blk.start == saved_blks[0].start) &&
	    (mid_blk.end == saved_blks[0].end))) ||
	    identical == 1) {
		saved_blks[n].start = mid_blk.start;
		saved_blks[n++].end = mid_blk.end;
	}
	/*
	 * Retain the remaining stored blocks that neither overlap mid_blk
	 * nor have fallen at/below rcv_nxt, up to MAX_SACK_BLKS.
	 */
	for (j = 0; (j < tp->rcv_numsacks) && (n < MAX_SACK_BLKS); j++) {
		if (((SEQ_LT(tp->sackblks[j].end, mid_blk.start) ||
		    SEQ_GT(tp->sackblks[j].start, mid_blk.end)) &&
		    (SEQ_GT(tp->sackblks[j].start, tp->rcv_nxt))))
			saved_blks[n++] = tp->sackblks[j];
	}
	j = 0;
	for (i = 0; i < n; i++) {
		/* we can end up with a stale initial entry */
		if (SEQ_LT(saved_blks[i].start, saved_blks[i].end)) {
			tp->sackblks[j++] = saved_blks[i];
		}
	}
	tp->rcv_numsacks = j;
}
/*
 * This function is called upon receipt of new valid data (while not in
 * header prediction mode), and it updates the ordered list of sacks.
 */
void
tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_start, tcp_seq rcv_end)
{
	/*
	 * First reported block MUST be the most recent one. Subsequent
	 * blocks SHOULD be in the order in which they arrived at the
	 * receiver. These two conditions make the implementation fully
	 * compliant with RFC 2018.
	 */
	struct sackblk head_blk, saved_blks[MAX_SACK_BLKS];
	int num_head, num_saved, i;

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	/* Check arguments. */
	KASSERT(SEQ_LEQ(rcv_start, rcv_end), ("rcv_start <= rcv_end"));

	if ((rcv_start == rcv_end) &&
	    (tp->rcv_numsacks >= 1) &&
	    (rcv_end == tp->sackblks[0].end)) {
		/* retaining DSACK block below rcv_nxt (todrop) */
		head_blk = tp->sackblks[0];
	} else {
		/* SACK block for the received segment. */
		head_blk.start = rcv_start;
		head_blk.end = rcv_end;
	}

	/*
	 * Merge updated SACK blocks into head_blk, and save unchanged SACK
	 * blocks into saved_blks[]. num_saved will have the number of the
	 * saved SACK blocks.
	 */
	num_saved = 0;
	for (i = 0; i < tp->rcv_numsacks; i++) {
		tcp_seq start = tp->sackblks[i].start;
		tcp_seq end = tp->sackblks[i].end;
		if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) {
			/*
			 * Discard this SACK block.
			 */
		} else if (SEQ_LEQ(head_blk.start, end) &&
		    SEQ_GEQ(head_blk.end, start)) {
			/*
			 * Merge this SACK block into head_blk. This SACK
			 * block itself will be discarded.
			 */
			/*
			 * |-|
			 * |---| merge
			 *
			 * |-|
			 * |---| merge
			 *
			 * |-----|
			 * |-| DSACK smaller
			 *
			 * |-|
			 * |-----| DSACK smaller
			 */
			if (head_blk.start == end)
				head_blk.start = start;
			else if (head_blk.end == start)
				head_blk.end = end;
			else {
				/*
				 * One range is contained in the other (the
				 * D-SACK case): swap edges so head_blk keeps
				 * the smaller range and [start, end) becomes
				 * the enclosing one.
				 */
				if (SEQ_LT(head_blk.start, start)) {
					tcp_seq temp = start;
					start = head_blk.start;
					head_blk.start = temp;
				}
				if (SEQ_GT(head_blk.end, end)) {
					tcp_seq temp = end;
					end = head_blk.end;
					head_blk.end = temp;
				}
				if ((head_blk.start != start) ||
				    (head_blk.end != end)) {
					/*
					 * The enclosing range absorbs the
					 * previously saved block, if covered.
					 */
					if ((num_saved >= 1) &&
					    SEQ_GEQ(saved_blks[num_saved-1].start, start) &&
					    SEQ_LEQ(saved_blks[num_saved-1].end, end))
						num_saved--;
					saved_blks[num_saved].start = start;
					saved_blks[num_saved].end = end;
					num_saved++;
				}
			}
		} else {
			/*
			 * This block supercedes the prior block
			 */
			if ((num_saved >= 1) &&
			    SEQ_GEQ(saved_blks[num_saved-1].start, start) &&
			    SEQ_LEQ(saved_blks[num_saved-1].end, end))
				num_saved--;
			/*
			 * Save this SACK block.
			 */
			saved_blks[num_saved].start = start;
			saved_blks[num_saved].end = end;
			num_saved++;
		}
	}

	/*
	 * Update SACK list in tp->sackblks[].
	 */
	num_head = 0;
	if (SEQ_LT(rcv_start, rcv_end)) {
		/*
		 * The received data segment is an out-of-order segment. Put
		 * head_blk at the top of SACK list.
		 */
		tp->sackblks[0] = head_blk;
		num_head = 1;
		/*
		 * If the number of saved SACK blocks exceeds its limit,
		 * discard the last SACK block.
		 */
		if (num_saved >= MAX_SACK_BLKS)
			num_saved--;
	}
	/* A zero-length D-SACK retained in slot 0 still occupies the head. */
	if ((rcv_start == rcv_end) &&
	    (rcv_start == tp->sackblks[0].end)) {
		num_head = 1;
	}
	if (num_saved > 0) {
		/*
		 * Copy the saved SACK blocks back.
		 */
		bcopy(saved_blks, &tp->sackblks[num_head],
		    sizeof(struct sackblk) * num_saved);
	}

	/* Save the number of SACK blocks. */
	tp->rcv_numsacks = num_head + num_saved;
}
  376. void
  377. tcp_clean_dsack_blocks(struct tcpcb *tp)
  378. {
  379. struct sackblk saved_blks[MAX_SACK_BLKS];
  380. int num_saved, i;
  381. INP_WLOCK_ASSERT(tptoinpcb(tp));
  382. /*
  383. * Clean up any DSACK blocks that
  384. * are in our queue of sack blocks.
  385. *
  386. */
  387. num_saved = 0;
  388. for (i = 0; i < tp->rcv_numsacks; i++) {
  389. tcp_seq start = tp->sackblks[i].start;
  390. tcp_seq end = tp->sackblks[i].end;
  391. if (SEQ_GEQ(start, end) || SEQ_LEQ(start, tp->rcv_nxt)) {
  392. /*
  393. * Discard this D-SACK block.
  394. */
  395. continue;
  396. }
  397. /*
  398. * Save this SACK block.
  399. */
  400. saved_blks[num_saved].start = start;
  401. saved_blks[num_saved].end = end;
  402. num_saved++;
  403. }
  404. if (num_saved > 0) {
  405. /*
  406. * Copy the saved SACK blocks back.
  407. */
  408. bcopy(saved_blks, &tp->sackblks[0],
  409. sizeof(struct sackblk) * num_saved);
  410. }
  411. tp->rcv_numsacks = num_saved;
  412. }
  413. /*
  414. * Delete all receiver-side SACK information.
  415. */
  416. void
  417. tcp_clean_sackreport(struct tcpcb *tp)
  418. {
  419. int i;
  420. INP_WLOCK_ASSERT(tptoinpcb(tp));
  421. tp->rcv_numsacks = 0;
  422. for (i = 0; i < MAX_SACK_BLKS; i++)
  423. tp->sackblks[i].start = tp->sackblks[i].end=0;
  424. }
  425. /*
  426. * Allocate struct sackhole.
  427. */
  428. static struct sackhole *
  429. tcp_sackhole_alloc(struct tcpcb *tp, tcp_seq start, tcp_seq end)
  430. {
  431. struct sackhole *hole;
  432. if (tp->snd_numholes >= V_tcp_sack_maxholes ||
  433. V_tcp_sack_globalholes >= V_tcp_sack_globalmaxholes) {
  434. TCPSTAT_INC(tcps_sack_sboverflow);
  435. return NULL;
  436. }
  437. hole = (struct sackhole *)uma_zalloc(V_sack_hole_zone, M_NOWAIT);
  438. if (hole == NULL)
  439. return NULL;
  440. hole->start = start;
  441. hole->end = end;
  442. hole->rxmit = start;
  443. tp->snd_numholes++;
  444. atomic_add_int(&V_tcp_sack_globalholes, 1);
  445. return hole;
  446. }
  447. /*
  448. * Free struct sackhole.
  449. */
  450. static void
  451. tcp_sackhole_free(struct tcpcb *tp, struct sackhole *hole)
  452. {
  453. uma_zfree(V_sack_hole_zone, hole);
  454. tp->snd_numholes--;
  455. atomic_subtract_int(&V_tcp_sack_globalholes, 1);
  456. KASSERT(tp->snd_numholes >= 0, ("tp->snd_numholes >= 0"));
  457. KASSERT(V_tcp_sack_globalholes >= 0, ("tcp_sack_globalholes >= 0"));
  458. }
  459. /*
  460. * Insert new SACK hole into scoreboard.
  461. */
  462. static struct sackhole *
  463. tcp_sackhole_insert(struct tcpcb *tp, tcp_seq start, tcp_seq end,
  464. struct sackhole *after)
  465. {
  466. struct sackhole *hole;
  467. /* Allocate a new SACK hole. */
  468. hole = tcp_sackhole_alloc(tp, start, end);
  469. if (hole == NULL)
  470. return NULL;
  471. /* Insert the new SACK hole into scoreboard. */
  472. if (after != NULL)
  473. TAILQ_INSERT_AFTER(&tp->snd_holes, after, hole, scblink);
  474. else
  475. TAILQ_INSERT_TAIL(&tp->snd_holes, hole, scblink);
  476. /* Update SACK hint. */
  477. if (tp->sackhint.nexthole == NULL)
  478. tp->sackhint.nexthole = hole;
  479. return hole;
  480. }
  481. /*
  482. * Remove SACK hole from scoreboard.
  483. */
  484. static void
  485. tcp_sackhole_remove(struct tcpcb *tp, struct sackhole *hole)
  486. {
  487. /* Update SACK hint. */
  488. if (tp->sackhint.nexthole == hole)
  489. tp->sackhint.nexthole = TAILQ_NEXT(hole, scblink);
  490. /* Remove this SACK hole. */
  491. TAILQ_REMOVE(&tp->snd_holes, hole, scblink);
  492. /* Free this SACK hole. */
  493. tcp_sackhole_free(tp, hole);
  494. }
  495. /*
  496. * Process cumulative ACK and the TCP SACK option to update the scoreboard.
  497. * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of
  498. * the sequence space).
  499. * Returns SACK_NEWLOSS if incoming ACK indicates ongoing loss (hole split, new hole),
  500. * SACK_CHANGE if incoming ACK has previously unknown SACK information,
  501. * SACK_NOCHANGE otherwise.
  502. */
  503. sackstatus_t
  504. tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
  505. {
  506. struct sackhole *cur, *temp;
  507. struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp;
  508. int i, j, num_sack_blks;
  509. sackstatus_t sack_changed;
  510. int delivered_data, left_edge_delta;
  511. int maxseg = tp->t_maxseg - MAX_TCPOPTLEN;
  512. tcp_seq loss_hiack = 0;
  513. int loss_thresh = 0;
  514. int loss_sblks = 0;
  515. int notlost_bytes = 0;
  516. INP_WLOCK_ASSERT(tptoinpcb(tp));
  517. num_sack_blks = 0;
  518. sack_changed = SACK_NOCHANGE;
  519. delivered_data = 0;
  520. left_edge_delta = 0;
  521. /*
  522. * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist,
  523. * treat [SND.UNA, SEG.ACK) as if it is a SACK block.
  524. * Account changes to SND.UNA always in delivered data.
  525. */
  526. if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) {
  527. left_edge_delta = th_ack - tp->snd_una;
  528. sack_blocks[num_sack_blks].start = tp->snd_una;
  529. sack_blocks[num_sack_blks++].end = th_ack;
  530. /*
  531. * Pulling snd_fack forward if we got here
  532. * due to DSACK blocks
  533. */
  534. if (SEQ_LT(tp->snd_fack, th_ack)) {
  535. delivered_data += th_ack - tp->snd_una;
  536. tp->snd_fack = th_ack;
  537. sack_changed = SACK_CHANGE;
  538. }
  539. }
  540. /*
  541. * Append received valid SACK blocks to sack_blocks[], but only if we
  542. * received new blocks from the other side.
  543. */
  544. if (to->to_flags & TOF_SACK) {
  545. for (i = 0; i < to->to_nsacks; i++) {
  546. bcopy((to->to_sacks + i * TCPOLEN_SACK),
  547. &sack, sizeof(sack));
  548. sack.start = ntohl(sack.start);
  549. sack.end = ntohl(sack.end);
  550. if (SEQ_GT(sack.end, sack.start) &&
  551. SEQ_GT(sack.start, tp->snd_una) &&
  552. SEQ_GT(sack.start, th_ack) &&
  553. SEQ_LT(sack.start, tp->snd_max) &&
  554. SEQ_GT(sack.end, tp->snd_una) &&
  555. SEQ_LEQ(sack.end, tp->snd_max) &&
  556. ((sack.end - sack.start) >= maxseg ||
  557. SEQ_GEQ(sack.end, tp->snd_max))) {
  558. sack_blocks[num_sack_blks++] = sack;
  559. } else if (SEQ_LEQ(sack.start, th_ack) &&
  560. SEQ_LEQ(sack.end, th_ack)) {
  561. /*
  562. * Its a D-SACK block.
  563. */
  564. tcp_record_dsack(tp, sack.start, sack.end, 0);
  565. }
  566. }
  567. }
  568. /*
  569. * Return if SND.UNA is not advanced and no valid SACK block is
  570. * received.
  571. */
  572. if (num_sack_blks == 0)
  573. return (sack_changed);
  574. /*
  575. * Sort the SACK blocks so we can update the scoreboard with just one
  576. * pass. The overhead of sorting up to 4+1 elements is less than
  577. * making up to 4+1 passes over the scoreboard.
  578. */
  579. for (i = 0; i < num_sack_blks; i++) {
  580. for (j = i + 1; j < num_sack_blks; j++) {
  581. if (SEQ_GT(sack_blocks[i].end, sack_blocks[j].end)) {
  582. sack = sack_blocks[i];
  583. sack_blocks[i] = sack_blocks[j];
  584. sack_blocks[j] = sack;
  585. }
  586. }
  587. }
  588. if (TAILQ_EMPTY(&tp->snd_holes)) {
  589. /*
  590. * Empty scoreboard. Need to initialize snd_fack (it may be
  591. * uninitialized or have a bogus value). Scoreboard holes
  592. * (from the sack blocks received) are created later below
  593. * (in the logic that adds holes to the tail of the
  594. * scoreboard).
  595. */
  596. tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack);
  597. tp->sackhint.sacked_bytes = 0; /* reset */
  598. tp->sackhint.hole_bytes = 0;
  599. }
  600. /*
  601. * In the while-loop below, incoming SACK blocks (sack_blocks[]) and
  602. * SACK holes (snd_holes) are traversed from their tails with just
  603. * one pass in order to reduce the number of compares especially when
  604. * the bandwidth-delay product is large.
  605. *
  606. * Note: Typically, in the first RTT of SACK recovery, the highest
  607. * three or four SACK blocks with the same ack number are received.
  608. * In the second RTT, if retransmitted data segments are not lost,
  609. * the highest three or four SACK blocks with ack number advancing
  610. * are received.
  611. */
  612. sblkp = &sack_blocks[num_sack_blks - 1]; /* Last SACK block */
  613. tp->sackhint.last_sack_ack = sblkp->end;
  614. if (SEQ_LT(tp->snd_fack, sblkp->start)) {
  615. /*
  616. * The highest SACK block is beyond fack. First,
  617. * check if there was a successful Rescue Retransmission,
  618. * and move this hole left. With normal holes, snd_fack
  619. * is always to the right of the end.
  620. */
  621. if (((temp = TAILQ_LAST(&tp->snd_holes, sackhole_head)) != NULL) &&
  622. SEQ_LEQ(tp->snd_fack,temp->end)) {
  623. tp->sackhint.hole_bytes -= temp->end - temp->start;
  624. temp->start = SEQ_MAX(tp->snd_fack, SEQ_MAX(tp->snd_una, th_ack));
  625. temp->end = sblkp->start;
  626. temp->rxmit = temp->start;
  627. delivered_data += sblkp->end - sblkp->start;
  628. tp->sackhint.hole_bytes += temp->end - temp->start;
  629. KASSERT(tp->sackhint.hole_bytes >= 0,
  630. ("sackhint hole bytes >= 0"));
  631. tp->snd_fack = sblkp->end;
  632. sblkp--;
  633. sack_changed = SACK_NEWLOSS;
  634. } else {
  635. /*
  636. * Append a new SACK hole at the tail. If the
  637. * second or later highest SACK blocks are also
  638. * beyond the current fack, they will be inserted
  639. * by way of hole splitting in the while-loop below.
  640. */
  641. temp = tcp_sackhole_insert(tp, tp->snd_fack,sblkp->start,NULL);
  642. if (temp != NULL) {
  643. delivered_data += sblkp->end - sblkp->start;
  644. tp->sackhint.hole_bytes += temp->end - temp->start;
  645. tp->snd_fack = sblkp->end;
  646. /* Go to the previous sack block. */
  647. sblkp--;
  648. sack_changed = SACK_CHANGE;
  649. } else {
  650. /*
  651. * We failed to add a new hole based on the current
  652. * sack block. Skip over all the sack blocks that
  653. * fall completely to the right of snd_fack and
  654. * proceed to trim the scoreboard based on the
  655. * remaining sack blocks. This also trims the
  656. * scoreboard for th_ack (which is sack_blocks[0]).
  657. */
  658. while (sblkp >= sack_blocks &&
  659. SEQ_LT(tp->snd_fack, sblkp->start))
  660. sblkp--;
  661. if (sblkp >= sack_blocks &&
  662. SEQ_LT(tp->snd_fack, sblkp->end)) {
  663. delivered_data += sblkp->end - tp->snd_fack;
  664. tp->snd_fack = sblkp->end;
  665. /*
  666. * While the Scoreboard didn't change in
  667. * size, we only ended up here because
  668. * some SACK data had to be dismissed.
  669. */
  670. sack_changed = SACK_NEWLOSS;
  671. }
  672. }
  673. }
  674. } else if (SEQ_LT(tp->snd_fack, sblkp->end)) {
  675. /* fack is advanced. */
  676. delivered_data += sblkp->end - tp->snd_fack;
  677. tp->snd_fack = sblkp->end;
  678. sack_changed = SACK_CHANGE;
  679. }
  680. cur = TAILQ_LAST(&tp->snd_holes, sackhole_head); /* Last SACK hole. */
  681. loss_hiack = tp->snd_fack;
  682. /*
  683. * Since the incoming sack blocks are sorted, we can process them
  684. * making one sweep of the scoreboard.
  685. */
  686. while (cur != NULL) {
  687. if (!(sblkp >= sack_blocks)) {
  688. if (((loss_sblks >= tcprexmtthresh) ||
  689. (loss_thresh > (tcprexmtthresh-1)*tp->t_maxseg)))
  690. break;
  691. loss_thresh += loss_hiack - cur->end;
  692. loss_hiack = cur->start;
  693. loss_sblks++;
  694. if (!((loss_sblks >= tcprexmtthresh) ||
  695. (loss_thresh > (tcprexmtthresh-1)*tp->t_maxseg))) {
  696. notlost_bytes += cur->end - cur->start;
  697. } else {
  698. break;
  699. }
  700. cur = TAILQ_PREV(cur, sackhole_head, scblink);
  701. continue;
  702. }
  703. if (SEQ_GEQ(sblkp->start, cur->end)) {
  704. /*
  705. * SACKs data beyond the current hole. Go to the
  706. * previous sack block.
  707. */
  708. sblkp--;
  709. continue;
  710. }
  711. if (SEQ_LEQ(sblkp->end, cur->start)) {
  712. /*
  713. * SACKs data before the current hole. Go to the
  714. * previous hole.
  715. */
  716. loss_thresh += loss_hiack - cur->end;
  717. loss_hiack = cur->start;
  718. loss_sblks++;
  719. if (!((loss_sblks >= tcprexmtthresh) ||
  720. (loss_thresh > (tcprexmtthresh-1)*tp->t_maxseg)))
  721. notlost_bytes += cur->end - cur->start;
  722. cur = TAILQ_PREV(cur, sackhole_head, scblink);
  723. continue;
  724. }
  725. tp->sackhint.sack_bytes_rexmit -=
  726. (SEQ_MIN(cur->rxmit, cur->end) - cur->start);
  727. KASSERT(tp->sackhint.sack_bytes_rexmit >= 0,
  728. ("sackhint bytes rtx >= 0"));
  729. sack_changed = SACK_CHANGE;
  730. if (SEQ_LEQ(sblkp->start, cur->start)) {
  731. /* Data acks at least the beginning of hole. */
  732. if (SEQ_GEQ(sblkp->end, cur->end)) {
  733. /* Acks entire hole, so delete hole. */
  734. delivered_data += (cur->end - cur->start);
  735. temp = cur;
  736. cur = TAILQ_PREV(cur, sackhole_head, scblink);
  737. tp->sackhint.hole_bytes -= temp->end - temp->start;
  738. tcp_sackhole_remove(tp, temp);
  739. /*
  740. * The sack block may ack all or part of the
  741. * next hole too, so continue onto the next
  742. * hole.
  743. */
  744. continue;
  745. } else {
  746. /* Move start of hole forward. */
  747. delivered_data += (sblkp->end - cur->start);
  748. tp->sackhint.hole_bytes -= sblkp->end - cur->start;
  749. cur->start = sblkp->end;
  750. cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
  751. }
  752. } else {
  753. /* Data acks at least the end of hole. */
  754. if (SEQ_GEQ(sblkp->end, cur->end)) {
  755. /* Move end of hole backward. */
  756. delivered_data += (cur->end - sblkp->start);
  757. tp->sackhint.hole_bytes -= cur->end - sblkp->start;
  758. cur->end = sblkp->start;
  759. cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
  760. if ((tp->t_flags & TF_LRD) && SEQ_GEQ(cur->rxmit, cur->end))
  761. cur->rxmit = tp->snd_recover;
  762. } else {
  763. /*
  764. * ACKs some data in middle of a hole; need
  765. * to split current hole
  766. */
  767. temp = tcp_sackhole_insert(tp, sblkp->end,
  768. cur->end, cur);
  769. sack_changed = SACK_NEWLOSS;
  770. if (temp != NULL) {
  771. if (SEQ_GT(cur->rxmit, temp->rxmit)) {
  772. temp->rxmit = cur->rxmit;
  773. tp->sackhint.sack_bytes_rexmit +=
  774. (SEQ_MIN(temp->rxmit,
  775. temp->end) - temp->start);
  776. }
  777. tp->sackhint.hole_bytes -= sblkp->end - sblkp->start;
  778. loss_thresh += loss_hiack - temp->end;
  779. loss_hiack = temp->start;
  780. loss_sblks++;
  781. if (!((loss_sblks >= tcprexmtthresh) ||
  782. (loss_thresh > (tcprexmtthresh-1)*tp->t_maxseg)))
  783. notlost_bytes += temp->end - temp->start;
  784. cur->end = sblkp->start;
  785. cur->rxmit = SEQ_MIN(cur->rxmit,
  786. cur->end);
  787. if ((tp->t_flags & TF_LRD) && SEQ_GEQ(cur->rxmit, cur->end))
  788. cur->rxmit = tp->snd_recover;
  789. delivered_data += (sblkp->end - sblkp->start);
  790. }
  791. }
  792. }
  793. tp->sackhint.sack_bytes_rexmit +=
  794. (SEQ_MIN(cur->rxmit, cur->end) - cur->start);
  795. /*
  796. * Testing sblkp->start against cur->start tells us whether
  797. * we're done with the sack block or the sack hole.
  798. * Accordingly, we advance one or the other.
  799. */
  800. if (SEQ_LEQ(sblkp->start, cur->start)) {
  801. loss_thresh += loss_hiack - cur->end;
  802. loss_hiack = cur->start;
  803. loss_sblks++;
  804. if (!((loss_sblks >= tcprexmtthresh) ||
  805. (loss_thresh > (tcprexmtthresh-1)*tp->t_maxseg)))
  806. notlost_bytes += cur->end - cur->start;
  807. cur = TAILQ_PREV(cur, sackhole_head, scblink);
  808. } else {
  809. sblkp--;
  810. }
  811. }
  812. KASSERT(!(TAILQ_EMPTY(&tp->snd_holes) && (tp->sackhint.hole_bytes != 0)),
  813. ("SACK scoreboard empty, but accounting non-zero\n"));
  814. KASSERT(notlost_bytes <= tp->sackhint.hole_bytes,
  815. ("SACK: more bytes marked notlost than in scoreboard holes"));
  816. if (!(to->to_flags & TOF_SACK))
  817. /*
  818. * If this ACK did not contain any
  819. * SACK blocks, any only moved the
  820. * left edge right, it is a pure
  821. * cumulative ACK. Do not count
  822. * DupAck for this. Also required
  823. * for RFC6675 rescue retransmission.
  824. */
  825. sack_changed = SACK_NOCHANGE;
  826. tp->sackhint.delivered_data = delivered_data;
  827. tp->sackhint.sacked_bytes += delivered_data - left_edge_delta;
  828. tp->sackhint.lost_bytes = tp->sackhint.hole_bytes - notlost_bytes;
  829. KASSERT((delivered_data >= 0), ("delivered_data < 0"));
  830. KASSERT((tp->sackhint.sacked_bytes >= 0), ("sacked_bytes < 0"));
  831. return (sack_changed);
  832. }
  833. /*
  834. * Free all SACK holes to clear the scoreboard.
  835. */
  836. void
  837. tcp_free_sackholes(struct tcpcb *tp)
  838. {
  839. struct sackhole *q;
  840. INP_WLOCK_ASSERT(tptoinpcb(tp));
  841. while ((q = TAILQ_FIRST(&tp->snd_holes)) != NULL)
  842. tcp_sackhole_remove(tp, q);
  843. tp->sackhint.sack_bytes_rexmit = 0;
  844. tp->sackhint.delivered_data = 0;
  845. tp->sackhint.sacked_bytes = 0;
  846. tp->sackhint.hole_bytes = 0;
  847. tp->sackhint.lost_bytes = 0;
  848. KASSERT(tp->snd_numholes == 0, ("tp->snd_numholes == 0"));
  849. KASSERT(tp->sackhint.nexthole == NULL,
  850. ("tp->sackhint.nexthole == NULL"));
  851. }
  852. /*
  853. * Resend all the currently existing SACK holes of
  854. * the scoreboard. This is in line with the Errata to
  855. * RFC 2018, which allows the use of SACK data past
  856. * an RTO to good effect typically.
  857. */
  858. void
  859. tcp_resend_sackholes(struct tcpcb *tp)
  860. {
  861. struct sackhole *p;
  862. INP_WLOCK_ASSERT(tptoinpcb(tp));
  863. TAILQ_FOREACH(p, &tp->snd_holes, scblink) {
  864. p->rxmit = p->start;
  865. }
  866. tp->sackhint.nexthole = TAILQ_FIRST(&tp->snd_holes);
  867. tp->sackhint.sack_bytes_rexmit = 0;
  868. }
/*
 * Partial ack handling within a sack recovery episode. Keeping this very
 * simple for now. When a partial ack is received, force snd_cwnd to a value
 * that will allow the sender to transmit no more than 2 segments. If
 * necessary, a better scheme can be adopted at a later point, but for now,
 * the goal is to prevent the sender from bursting a large amount of data in
 * the midst of sack recovery.
 */
void
tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th, u_int *maxsegp)
{
	struct sackhole *temp;
	int num_segs = 1;
	u_int maxseg;

	INP_WLOCK_ASSERT(tptoinpcb(tp));

	/* Compute the MSS lazily and cache it for the caller. */
	if (*maxsegp == 0) {
		*maxsegp = tcp_maxseg(tp);
	}
	maxseg = *maxsegp;
	/* Stop the retransmit timer and discard the running RTT sample. */
	tcp_timer_activate(tp, TT_REXMT, 0);
	tp->t_rtttime = 0;
	/* Send one or 2 segments based on how much new data was acked. */
	if ((BYTES_THIS_ACK(tp, th) / maxseg) >= 2)
		num_segs = 2;
	if (V_tcp_do_newsack) {
		/*
		 * Clamp cwnd to the data outstanding beyond this ack,
		 * corrected by the scoreboard hints (rexmitted minus
		 * sacked/lost bytes), but never below one MSS, plus the
		 * one or two segments allowed for this partial ack.
		 */
		tp->snd_cwnd = imax(tp->snd_nxt - th->th_ack +
		    tp->sackhint.sack_bytes_rexmit -
		    tp->sackhint.sacked_bytes -
		    tp->sackhint.lost_bytes, maxseg) +
		    num_segs * maxseg;
	} else {
		/* Legacy SACK: base cwnd on rexmitted bytes plus data sent
		 * past the recovery point. */
		tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit +
		    imax(0, tp->snd_nxt - tp->snd_recover) +
		    num_segs * maxseg);
	}
	/* Never open cwnd beyond ssthresh during recovery. */
	if (tp->snd_cwnd > tp->snd_ssthresh)
		tp->snd_cwnd = tp->snd_ssthresh;
	tp->t_flags |= TF_ACKNOW;
	/*
	 * RFC6675 rescue retransmission
	 * Add a hole between th_ack (snd_una is not yet set) and snd_max,
	 * if this was a pure cumulative ACK and no data was send beyond
	 * recovery point. Since the data in the socket has not been freed
	 * at this point, we check if the scoreboard is empty, and the ACK
	 * delivered some new data, indicating a full ACK. Also, if the
	 * recovery point is still at snd_max, we are probably application
	 * limited. However, this inference might not always be true. The
	 * rescue retransmission may rarely be slightly premature
	 * compared to RFC6675.
	 * The corresponding ACK+SACK will cause any further outstanding
	 * segments to be retransmitted. This addresses a corner case, when
	 * the trailing packets of a window are lost and no further data
	 * is available for sending.
	 */
	if ((V_tcp_do_newsack) &&
	    SEQ_LT(th->th_ack, tp->snd_recover) &&
	    TAILQ_EMPTY(&tp->snd_holes) &&
	    (tp->sackhint.delivered_data > 0)) {
		/*
		 * Exclude FIN sequence space in
		 * the hole for the rescue retransmission,
		 * and also don't create a hole, if only
		 * the ACK for a FIN is outstanding.
		 */
		tcp_seq highdata = tp->snd_max;
		if (tp->t_flags & TF_SENTFIN)
			highdata--;
		highdata = SEQ_MIN(highdata, tp->snd_recover);
		if (SEQ_LT(th->th_ack, highdata)) {
			/* Rescue hole covers at most one MSS below highdata. */
			tp->snd_fack = th->th_ack;
			if ((temp = tcp_sackhole_insert(tp, SEQ_MAX(th->th_ack,
			    highdata - maxseg), highdata, NULL)) != NULL) {
				tp->sackhint.hole_bytes +=
				    temp->end - temp->start;
			}
		}
	}
	(void) tcp_output(tp);
}
  948. /*
  949. * Returns the next hole to retransmit and the number of retransmitted bytes
  950. * from the scoreboard. We store both the next hole and the number of
  951. * retransmitted bytes as hints (and recompute these on the fly upon SACK/ACK
  952. * reception). This avoids scoreboard traversals completely.
  953. *
  954. * The loop here will traverse *at most* one link. Here's the argument. For
  955. * the loop to traverse more than 1 link before finding the next hole to
  956. * retransmit, we would need to have at least 1 node following the current
  957. * hint with (rxmit == end). But, for all holes following the current hint,
  958. * (start == rxmit), since we have not yet retransmitted from them.
  959. * Therefore, in order to traverse more 1 link in the loop below, we need to
  960. * have at least one node following the current hint with (start == rxmit ==
  961. * end). But that can't happen, (start == end) means that all the data in
  962. * that hole has been sacked, in which case, the hole would have been removed
  963. * from the scoreboard.
  964. */
  965. struct sackhole *
  966. tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt)
  967. {
  968. struct sackhole *hole = NULL;
  969. INP_WLOCK_ASSERT(tptoinpcb(tp));
  970. *sack_bytes_rexmt = tp->sackhint.sack_bytes_rexmit;
  971. hole = tp->sackhint.nexthole;
  972. if (hole == NULL)
  973. return (hole);
  974. if (SEQ_GEQ(hole->rxmit, hole->end)) {
  975. for (;;) {
  976. hole = TAILQ_NEXT(hole, scblink);
  977. if (hole == NULL)
  978. return (hole);
  979. if (SEQ_LT(hole->rxmit, hole->end)) {
  980. tp->sackhint.nexthole = hole;
  981. break;
  982. }
  983. }
  984. }
  985. KASSERT(SEQ_LT(hole->start, hole->end), ("%s: hole.start >= hole.end", __func__));
  986. if (!(V_tcp_do_newsack)) {
  987. KASSERT(SEQ_LT(hole->start, tp->snd_fack), ("%s: hole.start >= snd.fack", __func__));
  988. KASSERT(SEQ_LT(hole->end, tp->snd_fack), ("%s: hole.end >= snd.fack", __func__));
  989. KASSERT(SEQ_LT(hole->rxmit, tp->snd_fack), ("%s: hole.rxmit >= snd.fack", __func__));
  990. if (SEQ_GEQ(hole->start, hole->end) ||
  991. SEQ_GEQ(hole->start, tp->snd_fack) ||
  992. SEQ_GEQ(hole->end, tp->snd_fack) ||
  993. SEQ_GEQ(hole->rxmit, tp->snd_fack)) {
  994. log(LOG_CRIT,"tcp: invalid SACK hole (%u-%u,%u) vs fwd ack %u, ignoring.\n",
  995. hole->start, hole->end, hole->rxmit, tp->snd_fack);
  996. return (NULL);
  997. }
  998. }
  999. return (hole);
  1000. }
  1001. /*
  1002. * After a timeout, the SACK list may be rebuilt. This SACK information
  1003. * should be used to avoid retransmitting SACKed data. This function
  1004. * traverses the SACK list to see if snd_nxt should be moved forward.
  1005. */
  1006. void
  1007. tcp_sack_adjust(struct tcpcb *tp)
  1008. {
  1009. struct sackhole *p, *cur = TAILQ_FIRST(&tp->snd_holes);
  1010. INP_WLOCK_ASSERT(tptoinpcb(tp));
  1011. if (cur == NULL) {
  1012. /* No holes */
  1013. return;
  1014. }
  1015. if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack)) {
  1016. /* We're already beyond any SACKed blocks */
  1017. return;
  1018. }
  1019. /*-
  1020. * Two cases for which we want to advance snd_nxt:
  1021. * i) snd_nxt lies between end of one hole and beginning of another
  1022. * ii) snd_nxt lies between end of last hole and snd_fack
  1023. */
  1024. while ((p = TAILQ_NEXT(cur, scblink)) != NULL) {
  1025. if (SEQ_LT(tp->snd_nxt, cur->end)) {
  1026. return;
  1027. }
  1028. if (SEQ_GEQ(tp->snd_nxt, p->start)) {
  1029. cur = p;
  1030. } else {
  1031. tp->snd_nxt = p->start;
  1032. return;
  1033. }
  1034. }
  1035. if (SEQ_LT(tp->snd_nxt, cur->end)) {
  1036. return;
  1037. }
  1038. tp->snd_nxt = tp->snd_fack;
  1039. }
/*
 * Lost Retransmission Detection
 * Check if FACK is beyond the rexmit of the leftmost hole.
 * If yes, we restart sending from still existing holes,
 * and adjust cwnd via the congestion control module.
 */
void
tcp_sack_lost_retransmission(struct tcpcb *tp, struct tcphdr *th)
{
	struct sackhole *temp;

	/*
	 * Trigger only while in recovery, when new data past the recovery
	 * point has been SACKed, and the first hole was fully retransmitted
	 * yet the forward ACK point moved beyond its rexmit - i.e. the
	 * retransmission itself was apparently lost.
	 */
	if (IN_RECOVERY(tp->t_flags) &&
	    SEQ_GT(tp->snd_fack, tp->snd_recover) &&
	    ((temp = TAILQ_FIRST(&tp->snd_holes)) != NULL) &&
	    SEQ_GEQ(temp->rxmit, temp->end) &&
	    SEQ_GEQ(tp->snd_fack, temp->rxmit)) {
		TCPSTAT_INC(tcps_sack_lostrexmt);
		/*
		 * Start retransmissions from the first hole, and
		 * subsequently all other remaining holes, including
		 * those, which had been sent completely before.
		 */
		tp->sackhint.nexthole = temp;
		TAILQ_FOREACH(temp, &tp->snd_holes, scblink) {
			/* Rewind only holes fully sent and covered by FACK. */
			if (SEQ_GEQ(tp->snd_fack, temp->rxmit) &&
			    SEQ_GEQ(temp->rxmit, temp->end))
				temp->rxmit = temp->start;
		}
		/*
		 * Remember the old ssthresh, to deduct the beta factor used
		 * by the CC module. Finally, set cwnd to ssthresh just
		 * prior to invoking another cwnd reduction by the CC
		 * module, to not shrink it excessively.
		 */
		tp->snd_cwnd = tp->snd_ssthresh;
		/*
		 * Formally exit recovery, and let the CC module adjust
		 * ssthresh as intended.
		 */
		EXIT_RECOVERY(tp->t_flags);
		cc_cong_signal(tp, th, CC_NDUPACK);
		/*
		 * For PRR, adjust recover_fs as if this new reduction
		 * initialized this variable.
		 * cwnd will be adjusted by SACK or PRR processing
		 * subsequently, only set it to a safe value here.
		 */
		tp->snd_cwnd = tcp_maxseg(tp);
		tp->sackhint.recover_fs = (tp->snd_max - tp->snd_una) -
		    tp->sackhint.recover_fs;
	}
}