rss_config.c 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555
  1. /*-
  2. * Copyright (c) 2010-2011 Juniper Networks, Inc.
  3. * All rights reserved.
  4. *
  5. * This software was developed by Robert N. M. Watson under contract
  6. * to Juniper Networks, Inc.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. * 1. Redistributions of source code must retain the above copyright
  12. * notice, this list of conditions and the following disclaimer.
  13. * 2. Redistributions in binary form must reproduce the above copyright
  14. * notice, this list of conditions and the following disclaimer in the
  15. * documentation and/or other materials provided with the distribution.
  16. *
  17. * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  18. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  19. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  20. * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  21. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  22. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  23. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  24. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  25. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  26. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  27. * SUCH DAMAGE.
  28. */
  29. #include <sys/cdefs.h>
  30. __FBSDID("$FreeBSD$");
  31. #include "opt_inet6.h"
  32. #include "opt_pcbgroup.h"
  33. #ifndef PCBGROUP
  34. #error "options RSS depends on options PCBGROUP"
  35. #endif
  36. #include <sys/param.h>
  37. #include <sys/mbuf.h>
  38. #include <sys/socket.h>
  39. #include <sys/priv.h>
  40. #include <sys/kernel.h>
  41. #include <sys/smp.h>
  42. #include <sys/sysctl.h>
  43. #include <sys/sbuf.h>
  44. #include <net/if.h>
  45. #include <net/if_var.h>
  46. #include <net/netisr.h>
  47. #include <net/rss_config.h>
  48. #include <net/toeplitz.h>
  49. /*-
  50. * Operating system parts of receiver-side scaling (RSS), which allows
  51. * network cards to direct flows to particular receive queues based on hashes
  52. * of header tuples. This implementation aligns RSS buckets with connection
  53. * groups at the TCP/IP layer, so each bucket is associated with exactly one
  54. * group. As a result, the group lookup structures (and lock) should have an
  55. * effective affinity with exactly one CPU.
  56. *
  57. * Network device drivers needing to configure RSS will query this framework
  58. * for parameters, such as the current RSS key, hashing policies, number of
  59. * bits, and indirection table mapping hashes to buckets and CPUs. They may
  60. * provide their own supplementary information, such as queue<->CPU bindings.
  61. * It is the responsibility of the network device driver to inject packets
  62. * into the stack on as close to the right CPU as possible, if playing by RSS
  63. * rules.
  64. *
  65. * TODO:
  66. *
  67. * - Synchronization for rss_key and other future-configurable parameters.
  68. * - Event handler drivers can register to pick up RSS configuration changes.
  69. * - Should we allow rss_basecpu to be configured?
  70. * - Randomize key on boot.
  71. * - IPv6 support.
  72. * - Statistics on how often there's a misalignment between hardware
  73. * placement and pcbgroup expectations.
  74. */
  75. SYSCTL_DECL(_net_inet);
  76. SYSCTL_NODE(_net_inet, OID_AUTO, rss, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
  77. "Receive-side steering");
  78. /*
  79. * Toeplitz is the only required hash function in the RSS spec, so use it by
  80. * default.
  81. */
  82. static u_int rss_hashalgo = RSS_HASH_TOEPLITZ;
  83. SYSCTL_INT(_net_inet_rss, OID_AUTO, hashalgo, CTLFLAG_RDTUN, &rss_hashalgo, 0,
  84. "RSS hash algorithm");
  85. /*
  86. * Size of the indirection table; at most 128 entries per the RSS spec. We
  87. * size it to at least 2 times the number of CPUs by default to allow useful
  88. * rebalancing. If not set explicitly with a loader tunable, we tune based
  89. * on the number of CPUs present.
  90. *
  91. * XXXRW: buckets might be better to use for the tunable than bits.
  92. */
  93. static u_int rss_bits;
  94. SYSCTL_INT(_net_inet_rss, OID_AUTO, bits, CTLFLAG_RDTUN, &rss_bits, 0,
  95. "RSS bits");
  96. static u_int rss_mask;
  97. SYSCTL_INT(_net_inet_rss, OID_AUTO, mask, CTLFLAG_RD, &rss_mask, 0,
  98. "RSS mask");
  99. static const u_int rss_maxbits = RSS_MAXBITS;
  100. SYSCTL_INT(_net_inet_rss, OID_AUTO, maxbits, CTLFLAG_RD,
  101. __DECONST(int *, &rss_maxbits), 0, "RSS maximum bits");
  102. /*
  103. * RSS's own count of the number of CPUs it could be using for processing.
  104. * Bounded to 64 by RSS constants.
  105. */
  106. static u_int rss_ncpus;
  107. SYSCTL_INT(_net_inet_rss, OID_AUTO, ncpus, CTLFLAG_RD, &rss_ncpus, 0,
  108. "Number of CPUs available to RSS");
  109. #define RSS_MAXCPUS (1 << (RSS_MAXBITS - 1))
  110. static const u_int rss_maxcpus = RSS_MAXCPUS;
  111. SYSCTL_INT(_net_inet_rss, OID_AUTO, maxcpus, CTLFLAG_RD,
  112. __DECONST(int *, &rss_maxcpus), 0, "RSS maximum CPUs that can be used");
  113. /*
  114. * Variable exists just for reporting rss_bits in a user-friendly way.
  115. */
  116. static u_int rss_buckets;
  117. SYSCTL_INT(_net_inet_rss, OID_AUTO, buckets, CTLFLAG_RD, &rss_buckets, 0,
  118. "RSS buckets");
  119. /*
  120. * Base CPU number; devices will add this to all CPU numbers returned by the
  121. * RSS indirection table. Currently unmodifable in FreeBSD.
  122. */
  123. static const u_int rss_basecpu;
  124. SYSCTL_INT(_net_inet_rss, OID_AUTO, basecpu, CTLFLAG_RD,
  125. __DECONST(int *, &rss_basecpu), 0, "RSS base CPU");
  126. /*
  127. * Print verbose debugging messages.
  128. * 0 - disable
  129. * non-zero - enable
  130. */
  131. int rss_debug = 0;
  132. SYSCTL_INT(_net_inet_rss, OID_AUTO, debug, CTLFLAG_RWTUN, &rss_debug, 0,
  133. "RSS debug level");
  134. /*
  135. * RSS secret key, intended to prevent attacks on load-balancing. Its
  136. * effectiveness may be limited by algorithm choice and available entropy
  137. * during the boot.
  138. *
  139. * XXXRW: And that we don't randomize it yet!
  140. *
  141. * This is the default Microsoft RSS specification key which is also
  142. * the Chelsio T5 firmware default key.
  143. */
  144. static uint8_t rss_key[RSS_KEYSIZE] = {
  145. 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
  146. 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
  147. 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
  148. 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
  149. 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
  150. };
  151. /*
  152. * RSS hash->CPU table, which maps hashed packet headers to particular CPUs.
  153. * Drivers may supplement this table with a separate CPU<->queue table when
  154. * programming devices.
  155. */
  156. struct rss_table_entry {
  157. uint8_t rte_cpu; /* CPU affinity of bucket. */
  158. };
  159. static struct rss_table_entry rss_table[RSS_TABLE_MAXLEN];
  160. static void
  161. rss_init(__unused void *arg)
  162. {
  163. u_int i;
  164. u_int cpuid;
  165. /*
  166. * Validate tunables, coerce to sensible values.
  167. */
  168. switch (rss_hashalgo) {
  169. case RSS_HASH_TOEPLITZ:
  170. case RSS_HASH_NAIVE:
  171. break;
  172. default:
  173. RSS_DEBUG("invalid RSS hashalgo %u, coercing to %u\n",
  174. rss_hashalgo, RSS_HASH_TOEPLITZ);
  175. rss_hashalgo = RSS_HASH_TOEPLITZ;
  176. }
  177. /*
  178. * Count available CPUs.
  179. *
  180. * XXXRW: Note incorrect assumptions regarding contiguity of this set
  181. * elsewhere.
  182. */
  183. rss_ncpus = 0;
  184. for (i = 0; i <= mp_maxid; i++) {
  185. if (CPU_ABSENT(i))
  186. continue;
  187. rss_ncpus++;
  188. }
  189. if (rss_ncpus > RSS_MAXCPUS)
  190. rss_ncpus = RSS_MAXCPUS;
  191. /*
  192. * Tune RSS table entries to be no less than 2x the number of CPUs
  193. * -- unless we're running uniprocessor, in which case there's not
  194. * much point in having buckets to rearrange for load-balancing!
  195. */
  196. if (rss_ncpus > 1) {
  197. if (rss_bits == 0)
  198. rss_bits = fls(rss_ncpus - 1) + 1;
  199. /*
  200. * Microsoft limits RSS table entries to 128, so apply that
  201. * limit to both auto-detected CPU counts and user-configured
  202. * ones.
  203. */
  204. if (rss_bits == 0 || rss_bits > RSS_MAXBITS) {
  205. RSS_DEBUG("RSS bits %u not valid, coercing to %u\n",
  206. rss_bits, RSS_MAXBITS);
  207. rss_bits = RSS_MAXBITS;
  208. }
  209. /*
  210. * Figure out how many buckets to use; warn if less than the
  211. * number of configured CPUs, although this is not a fatal
  212. * problem.
  213. */
  214. rss_buckets = (1 << rss_bits);
  215. if (rss_buckets < rss_ncpus)
  216. RSS_DEBUG("WARNING: rss_buckets (%u) less than "
  217. "rss_ncpus (%u)\n", rss_buckets, rss_ncpus);
  218. rss_mask = rss_buckets - 1;
  219. } else {
  220. rss_bits = 0;
  221. rss_buckets = 1;
  222. rss_mask = 0;
  223. }
  224. /*
  225. * Set up initial CPU assignments: round-robin by default.
  226. */
  227. cpuid = CPU_FIRST();
  228. for (i = 0; i < rss_buckets; i++) {
  229. rss_table[i].rte_cpu = cpuid;
  230. cpuid = CPU_NEXT(cpuid);
  231. }
  232. /*
  233. * Randomize rrs_key.
  234. *
  235. * XXXRW: Not yet. If nothing else, will require an rss_isbadkey()
  236. * loop to check for "bad" RSS keys.
  237. */
  238. }
  239. SYSINIT(rss_init, SI_SUB_SOFTINTR, SI_ORDER_SECOND, rss_init, NULL);
  /*
   * Trivial additive hash: the sum of every key byte plus every data byte,
   * accumulated in 32 bits.
   *
   * keylen/key   - length of and pointer to the key bytes
   * datalen/data - length of and pointer to the bytes being hashed
   */
  static uint32_t
  rss_naive_hash(u_int keylen, const uint8_t *key, u_int datalen,
      const uint8_t *data)
  {
      const uint8_t *p;
      uint32_t sum = 0;

      for (p = key; p < key + keylen; p++)
          sum += *p;
      for (p = data; p < data + datalen; p++)
          sum += *p;

      return (sum);
  }
  253. uint32_t
  254. rss_hash(u_int datalen, const uint8_t *data)
  255. {
  256. switch (rss_hashalgo) {
  257. case RSS_HASH_TOEPLITZ:
  258. return (toeplitz_hash(sizeof(rss_key), rss_key, datalen,
  259. data));
  260. case RSS_HASH_NAIVE:
  261. return (rss_naive_hash(sizeof(rss_key), rss_key, datalen,
  262. data));
  263. default:
  264. panic("%s: unsupported/unknown hashalgo %d", __func__,
  265. rss_hashalgo);
  266. }
  267. }
  268. /*
  269. * Query the number of RSS bits in use.
  270. */
  271. u_int
  272. rss_getbits(void)
  273. {
  274. return (rss_bits);
  275. }
  276. /*
  277. * Query the RSS bucket associated with an RSS hash.
  278. */
  279. u_int
  280. rss_getbucket(u_int hash)
  281. {
  282. return (hash & rss_mask);
  283. }
  284. /*
  285. * Query the RSS layer bucket associated with the given
  286. * entry in the RSS hash space.
  287. *
  288. * The RSS indirection table is 0 .. rss_buckets-1,
  289. * covering the low 'rss_bits' of the total 128 slot
  290. * RSS indirection table. So just mask off rss_bits and
  291. * return that.
  292. *
  293. * NIC drivers can then iterate over the 128 slot RSS
  294. * indirection table and fetch which RSS bucket to
  295. * map it to. This will typically be a CPU queue
  296. */
  297. u_int
  298. rss_get_indirection_to_bucket(u_int index)
  299. {
  300. return (index & rss_mask);
  301. }
  302. /*
  303. * Query the RSS CPU associated with an RSS bucket.
  304. */
  305. u_int
  306. rss_getcpu(u_int bucket)
  307. {
  308. return (rss_table[bucket].rte_cpu);
  309. }
  310. /*
  311. * netisr CPU affinity lookup given just the hash and hashtype.
  312. */
  313. u_int
  314. rss_hash2cpuid(uint32_t hash_val, uint32_t hash_type)
  315. {
  316. switch (hash_type) {
  317. case M_HASHTYPE_RSS_IPV4:
  318. case M_HASHTYPE_RSS_TCP_IPV4:
  319. case M_HASHTYPE_RSS_UDP_IPV4:
  320. case M_HASHTYPE_RSS_IPV6:
  321. case M_HASHTYPE_RSS_TCP_IPV6:
  322. case M_HASHTYPE_RSS_UDP_IPV6:
  323. return (rss_getcpu(rss_getbucket(hash_val)));
  324. default:
  325. return (NETISR_CPUID_NONE);
  326. }
  327. }
  328. /*
  329. * Query the RSS bucket associated with the given hash value and
  330. * type.
  331. */
  332. int
  333. rss_hash2bucket(uint32_t hash_val, uint32_t hash_type, uint32_t *bucket_id)
  334. {
  335. switch (hash_type) {
  336. case M_HASHTYPE_RSS_IPV4:
  337. case M_HASHTYPE_RSS_TCP_IPV4:
  338. case M_HASHTYPE_RSS_UDP_IPV4:
  339. case M_HASHTYPE_RSS_IPV6:
  340. case M_HASHTYPE_RSS_TCP_IPV6:
  341. case M_HASHTYPE_RSS_UDP_IPV6:
  342. *bucket_id = rss_getbucket(hash_val);
  343. return (0);
  344. default:
  345. return (-1);
  346. }
  347. }
  348. /*
  349. * netisr CPU affinity lookup routine for use by protocols.
  350. */
  351. struct mbuf *
  352. rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid)
  353. {
  354. M_ASSERTPKTHDR(m);
  355. *cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m));
  356. return (m);
  357. }
  358. int
  359. rss_m2bucket(struct mbuf *m, uint32_t *bucket_id)
  360. {
  361. M_ASSERTPKTHDR(m);
  362. return(rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
  363. bucket_id));
  364. }
  365. /*
  366. * Query the RSS hash algorithm.
  367. */
  368. u_int
  369. rss_gethashalgo(void)
  370. {
  371. return (rss_hashalgo);
  372. }
  373. /*
  374. * Query the current RSS key; likely to be used by device drivers when
  375. * configuring hardware RSS. Caller must pass an array of size RSS_KEYSIZE.
  376. *
  377. * XXXRW: Perhaps we should do the accept-a-length-and-truncate thing?
  378. */
  379. void
  380. rss_getkey(uint8_t *key)
  381. {
  382. bcopy(rss_key, key, sizeof(rss_key));
  383. }
  384. /*
  385. * Query the number of buckets; this may be used by both network device
  386. * drivers, which will need to populate hardware shadows of the software
  387. * indirection table, and the network stack itself (such as when deciding how
  388. * many connection groups to allocate).
  389. */
  390. u_int
  391. rss_getnumbuckets(void)
  392. {
  393. return (rss_buckets);
  394. }
  395. /*
  396. * Query the number of CPUs in use by RSS; may be useful to device drivers
  397. * trying to figure out how to map a larger number of CPUs into a smaller
  398. * number of receive queues.
  399. */
  400. u_int
  401. rss_getnumcpus(void)
  402. {
  403. return (rss_ncpus);
  404. }
  405. /*
  406. * Return the supported RSS hash configuration.
  407. *
  408. * NICs should query this to determine what to configure in their redirection
  409. * matching table.
  410. */
  411. inline u_int
  412. rss_gethashconfig(void)
  413. {
  414. /* Return 4-tuple for TCP; 2-tuple for others */
  415. /*
  416. * UDP may fragment more often than TCP and thus we'll end up with
  417. * NICs returning 2-tuple fragments.
  418. * udp_init() and udplite_init() both currently initialise things
  419. * as 2-tuple.
  420. * So for now disable UDP 4-tuple hashing until all of the other
  421. * pieces are in place.
  422. */
  423. return (
  424. RSS_HASHTYPE_RSS_IPV4
  425. | RSS_HASHTYPE_RSS_TCP_IPV4
  426. | RSS_HASHTYPE_RSS_IPV6
  427. | RSS_HASHTYPE_RSS_TCP_IPV6
  428. | RSS_HASHTYPE_RSS_IPV6_EX
  429. | RSS_HASHTYPE_RSS_TCP_IPV6_EX
  430. #if 0
  431. | RSS_HASHTYPE_RSS_UDP_IPV4
  432. | RSS_HASHTYPE_RSS_UDP_IPV6
  433. | RSS_HASHTYPE_RSS_UDP_IPV6_EX
  434. #endif
  435. );
  436. }
  437. /*
  438. * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want
  439. * it appearing in debugging output unnecessarily.
  440. */
  441. static int
  442. sysctl_rss_key(SYSCTL_HANDLER_ARGS)
  443. {
  444. uint8_t temp_rss_key[RSS_KEYSIZE];
  445. int error;
  446. error = priv_check(req->td, PRIV_NETINET_HASHKEY);
  447. if (error)
  448. return (error);
  449. bcopy(rss_key, temp_rss_key, sizeof(temp_rss_key));
  450. error = sysctl_handle_opaque(oidp, temp_rss_key,
  451. sizeof(temp_rss_key), req);
  452. if (error)
  453. return (error);
  454. if (req->newptr != NULL) {
  455. /* XXXRW: Not yet. */
  456. return (EINVAL);
  457. }
  458. return (0);
  459. }
  460. SYSCTL_PROC(_net_inet_rss, OID_AUTO, key,
  461. CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rss_key,
  462. "", "RSS keying material");
  463. static int
  464. sysctl_rss_bucket_mapping(SYSCTL_HANDLER_ARGS)
  465. {
  466. struct sbuf *sb;
  467. int error;
  468. int i;
  469. error = 0;
  470. error = sysctl_wire_old_buffer(req, 0);
  471. if (error != 0)
  472. return (error);
  473. sb = sbuf_new_for_sysctl(NULL, NULL, 512, req);
  474. if (sb == NULL)
  475. return (ENOMEM);
  476. for (i = 0; i < rss_buckets; i++) {
  477. sbuf_printf(sb, "%s%d:%d", i == 0 ? "" : " ",
  478. i,
  479. rss_getcpu(i));
  480. }
  481. error = sbuf_finish(sb);
  482. sbuf_delete(sb);
  483. return (error);
  484. }
  485. SYSCTL_PROC(_net_inet_rss, OID_AUTO, bucket_mapping,
  486. CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
  487. sysctl_rss_bucket_mapping, "", "RSS bucket -> CPU mapping");