ip_output.c 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629
  1. /*-
  2. * SPDX-License-Identifier: BSD-3-Clause
  3. *
  4. * Copyright (c) 1982, 1986, 1988, 1990, 1993
  5. * The Regents of the University of California. All rights reserved.
  6. *
  7. * Redistribution and use in source and binary forms, with or without
  8. * modification, are permitted provided that the following conditions
  9. * are met:
  10. * 1. Redistributions of source code must retain the above copyright
  11. * notice, this list of conditions and the following disclaimer.
  12. * 2. Redistributions in binary form must reproduce the above copyright
  13. * notice, this list of conditions and the following disclaimer in the
  14. * documentation and/or other materials provided with the distribution.
  15. * 3. Neither the name of the University nor the names of its contributors
  16. * may be used to endorse or promote products derived from this software
  17. * without specific prior written permission.
  18. *
  19. * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  20. * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  21. * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  22. * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  23. * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  24. * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  25. * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  26. * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  27. * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  28. * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  29. * SUCH DAMAGE.
  30. *
  31. * @(#)ip_output.c 8.3 (Berkeley) 1/21/94
  32. */
  33. #include <sys/cdefs.h>
  34. __FBSDID("$FreeBSD$");
  35. #include "opt_inet.h"
  36. #include "opt_ipsec.h"
  37. #include "opt_kern_tls.h"
  38. #include "opt_mbuf_stress_test.h"
  39. #include "opt_ratelimit.h"
  40. #include "opt_route.h"
  41. #include "opt_rss.h"
  42. #include "opt_sctp.h"
  43. #include <sys/param.h>
  44. #include <sys/systm.h>
  45. #include <sys/kernel.h>
  46. #include <sys/ktls.h>
  47. #include <sys/lock.h>
  48. #include <sys/malloc.h>
  49. #include <sys/mbuf.h>
  50. #include <sys/priv.h>
  51. #include <sys/proc.h>
  52. #include <sys/protosw.h>
  53. #include <sys/rmlock.h>
  54. #include <sys/sdt.h>
  55. #include <sys/socket.h>
  56. #include <sys/socketvar.h>
  57. #include <sys/sysctl.h>
  58. #include <sys/ucred.h>
  59. #include <net/if.h>
  60. #include <net/if_var.h>
  61. #include <net/if_vlan_var.h>
  62. #include <net/if_llatbl.h>
  63. #include <net/ethernet.h>
  64. #include <net/netisr.h>
  65. #include <net/pfil.h>
  66. #include <net/route.h>
  67. #include <net/route/nhop.h>
  68. #include <net/rss_config.h>
  69. #include <net/vnet.h>
  70. #include <netinet/in.h>
  71. #include <netinet/in_fib.h>
  72. #include <netinet/in_kdtrace.h>
  73. #include <netinet/in_systm.h>
  74. #include <netinet/ip.h>
  75. #include <netinet/in_fib.h>
  76. #include <netinet/in_pcb.h>
  77. #include <netinet/in_rss.h>
  78. #include <netinet/in_var.h>
  79. #include <netinet/ip_var.h>
  80. #include <netinet/ip_options.h>
  81. #include <netinet/udp.h>
  82. #include <netinet/udp_var.h>
  83. #if defined(SCTP) || defined(SCTP_SUPPORT)
  84. #include <netinet/sctp.h>
  85. #include <netinet/sctp_crc32.h>
  86. #endif
  87. #include <netipsec/ipsec_support.h>
  88. #include <machine/in_cksum.h>
  89. #include <security/mac/mac_framework.h>
  /*
   * Stress-test knob: when non-zero, ip_output() re-fragments unfragmented
   * outgoing packets whose length exceeds this value with m_fragment().
   */
  90. #ifdef MBUF_STRESS_TEST
  91. static int mbuf_frag_size = 0;
  92. SYSCTL_INT(_net_inet_ip, OID_AUTO, mbuf_frag_size, CTLFLAG_RW,
  93. &mbuf_frag_size, 0, "Fragment outgoing mbufs to this size");
  94. #endif
  /* Called by ip_output() to loop a copy of a multicast datagram back. */
  95. static void ip_mloopback(struct ifnet *, const struct mbuf *, int);
  /* Consulted by ip_output() for multicast loopback when no ip_moptions is given. */
  96. extern int in_mcast_loop;
  /* NOTE(review): not referenced in the part of the file reviewed here. */
  97. extern struct protosw inetsw[];
  98. static inline int
  99. ip_output_pfil(struct mbuf **mp, struct ifnet *ifp, int flags,
  100. struct inpcb *inp, struct sockaddr_in *dst, int *fibnum, int *error)
  101. {
  102. struct m_tag *fwd_tag = NULL;
  103. struct mbuf *m;
  104. struct in_addr odst;
  105. struct ip *ip;
  106. int pflags = PFIL_OUT;
  107. if (flags & IP_FORWARDING)
  108. pflags |= PFIL_FWD;
  109. m = *mp;
  110. ip = mtod(m, struct ip *);
  111. /* Run through list of hooks for output packets. */
  112. odst.s_addr = ip->ip_dst.s_addr;
  113. switch (pfil_run_hooks(V_inet_pfil_head, mp, ifp, pflags, inp)) {
  114. case PFIL_DROPPED:
  115. *error = EACCES;
  116. /* FALLTHROUGH */
  117. case PFIL_CONSUMED:
  118. return 1; /* Finished */
  119. case PFIL_PASS:
  120. *error = 0;
  121. }
  122. m = *mp;
  123. ip = mtod(m, struct ip *);
  124. /* See if destination IP address was changed by packet filter. */
  125. if (odst.s_addr != ip->ip_dst.s_addr) {
  126. m->m_flags |= M_SKIP_FIREWALL;
  127. /* If destination is now ourself drop to ip_input(). */
  128. if (in_localip(ip->ip_dst)) {
  129. m->m_flags |= M_FASTFWD_OURS;
  130. if (m->m_pkthdr.rcvif == NULL)
  131. m->m_pkthdr.rcvif = V_loif;
  132. if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
  133. m->m_pkthdr.csum_flags |=
  134. CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
  135. m->m_pkthdr.csum_data = 0xffff;
  136. }
  137. m->m_pkthdr.csum_flags |=
  138. CSUM_IP_CHECKED | CSUM_IP_VALID;
  139. #if defined(SCTP) || defined(SCTP_SUPPORT)
  140. if (m->m_pkthdr.csum_flags & CSUM_SCTP)
  141. m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
  142. #endif
  143. *error = netisr_queue(NETISR_IP, m);
  144. return 1; /* Finished */
  145. }
  146. bzero(dst, sizeof(*dst));
  147. dst->sin_family = AF_INET;
  148. dst->sin_len = sizeof(*dst);
  149. dst->sin_addr = ip->ip_dst;
  150. return -1; /* Reloop */
  151. }
  152. /* See if fib was changed by packet filter. */
  153. if ((*fibnum) != M_GETFIB(m)) {
  154. m->m_flags |= M_SKIP_FIREWALL;
  155. *fibnum = M_GETFIB(m);
  156. return -1; /* Reloop for FIB change */
  157. }
  158. /* See if local, if yes, send it to netisr with IP_FASTFWD_OURS. */
  159. if (m->m_flags & M_FASTFWD_OURS) {
  160. if (m->m_pkthdr.rcvif == NULL)
  161. m->m_pkthdr.rcvif = V_loif;
  162. if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
  163. m->m_pkthdr.csum_flags |=
  164. CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
  165. m->m_pkthdr.csum_data = 0xffff;
  166. }
  167. #if defined(SCTP) || defined(SCTP_SUPPORT)
  168. if (m->m_pkthdr.csum_flags & CSUM_SCTP)
  169. m->m_pkthdr.csum_flags |= CSUM_SCTP_VALID;
  170. #endif
  171. m->m_pkthdr.csum_flags |=
  172. CSUM_IP_CHECKED | CSUM_IP_VALID;
  173. *error = netisr_queue(NETISR_IP, m);
  174. return 1; /* Finished */
  175. }
  176. /* Or forward to some other address? */
  177. if ((m->m_flags & M_IP_NEXTHOP) &&
  178. ((fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL)) != NULL)) {
  179. bcopy((fwd_tag+1), dst, sizeof(struct sockaddr_in));
  180. m->m_flags |= M_SKIP_FIREWALL;
  181. m->m_flags &= ~M_IP_NEXTHOP;
  182. m_tag_delete(m, fwd_tag);
  183. return -1; /* Reloop for CHANGE of dst */
  184. }
  185. return 0;
  186. }
  187. static int
  188. ip_output_send(struct inpcb *inp, struct ifnet *ifp, struct mbuf *m,
  189. const struct sockaddr_in *gw, struct route *ro, bool stamp_tag)
  190. {
  191. #ifdef KERN_TLS
  192. struct ktls_session *tls = NULL;
  193. #endif
  194. struct m_snd_tag *mst;
  195. int error;
  196. MPASS((m->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
  197. mst = NULL;
  198. #ifdef KERN_TLS
  199. /*
  200. * If this is an unencrypted TLS record, save a reference to
  201. * the record. This local reference is used to call
  202. * ktls_output_eagain after the mbuf has been freed (thus
  203. * dropping the mbuf's reference) in if_output.
  204. */
  205. if (m->m_next != NULL && mbuf_has_tls_session(m->m_next)) {
  206. tls = ktls_hold(m->m_next->m_epg_tls);
  207. mst = tls->snd_tag;
  208. /*
  209. * If a TLS session doesn't have a valid tag, it must
  210. * have had an earlier ifp mismatch, so drop this
  211. * packet.
  212. */
  213. if (mst == NULL) {
  214. error = EAGAIN;
  215. goto done;
  216. }
  217. /*
  218. * Always stamp tags that include NIC ktls.
  219. */
  220. stamp_tag = true;
  221. }
  222. #endif
  223. #ifdef RATELIMIT
  224. if (inp != NULL && mst == NULL) {
  225. if ((inp->inp_flags2 & INP_RATE_LIMIT_CHANGED) != 0 ||
  226. (inp->inp_snd_tag != NULL &&
  227. inp->inp_snd_tag->ifp != ifp))
  228. in_pcboutput_txrtlmt(inp, ifp, m);
  229. if (inp->inp_snd_tag != NULL)
  230. mst = inp->inp_snd_tag;
  231. }
  232. #endif
  233. if (stamp_tag && mst != NULL) {
  234. KASSERT(m->m_pkthdr.rcvif == NULL,
  235. ("trying to add a send tag to a forwarded packet"));
  236. if (mst->ifp != ifp) {
  237. error = EAGAIN;
  238. goto done;
  239. }
  240. /* stamp send tag on mbuf */
  241. m->m_pkthdr.snd_tag = m_snd_tag_ref(mst);
  242. m->m_pkthdr.csum_flags |= CSUM_SND_TAG;
  243. }
  244. error = (*ifp->if_output)(ifp, m, (const struct sockaddr *)gw, ro);
  245. done:
  246. /* Check for route change invalidating send tags. */
  247. #ifdef KERN_TLS
  248. if (tls != NULL) {
  249. if (error == EAGAIN)
  250. error = ktls_output_eagain(inp, tls);
  251. ktls_free(tls);
  252. }
  253. #endif
  254. #ifdef RATELIMIT
  255. if (error == EAGAIN)
  256. in_pcboutput_eagain(inp);
  257. #endif
  258. return (error);
  259. }
  260. /* rte<>ro_flags translation */
  261. static inline void
  262. rt_update_ro_flags(struct route *ro)
  263. {
  264. int nh_flags = ro->ro_nh->nh_flags;
  265. ro->ro_flags &= ~ (RT_REJECT|RT_BLACKHOLE|RT_HAS_GW);
  266. ro->ro_flags |= (nh_flags & NHF_REJECT) ? RT_REJECT : 0;
  267. ro->ro_flags |= (nh_flags & NHF_BLACKHOLE) ? RT_BLACKHOLE : 0;
  268. ro->ro_flags |= (nh_flags & NHF_GATEWAY) ? RT_HAS_GW : 0;
  269. }
  270. /*
  271. * IP output. The packet in mbuf chain m contains a skeletal IP
  272. * header (with len, off, ttl, proto, tos, src, dst).
  273. * The mbuf chain containing the packet will be freed.
  274. * The mbuf opt, if present, will not be freed.
  275. * If route ro is present and has ro_nh initialized, the route lookup is
  276. * skipped and ro->ro_nh is used (after the cache checks in the body). If ro
  277. * is present but ro->ro_nh is NULL, the lookup result is stored in ro->ro_nh.
  278. *
  279. * In the IP forwarding case, the packet will arrive with options already
  280. * inserted, so must have a NULL opt pointer.
  281. */
  /*
   * Arguments, as used by the body below:
   *   m     - packet to send; must carry a packet header (M_ASSERTPKTHDR).
   *   opt   - optional IP options, inserted with ip_insertoptions() when
   *           non-NULL.
   *   ro    - optional route cache; when NULL an on-stack sockaddr_in holds
   *           the destination.
   *   flags - IP_* flags; this function tests IP_FORWARDING, IP_RAWOUTPUT,
   *           IP_SENDONES, IP_ROUTETOIF, IP_ALLOWBROADCAST,
   *           IP_NODEFAULTFLOWID and IP_NO_SND_TAG_RL.
   *   imo   - optional multicast options (interface, ttl, vif, loop).
   *   inp   - optional PCB; when non-NULL it must be locked and supplies the
   *           FIB number, flow id/type and VLAN PCP (and the NUMA domain
   *           under NUMA).
   *
   * Must be called inside the network epoch (NET_EPOCH_ASSERT).
   *
   * Returns 0 on success or an errno value. Errors set directly here are
   * ENETUNREACH, EHOSTUNREACH, EADDRNOTAVAIL, EACCES, EMSGSIZE and ENOBUFS;
   * errors from IPSEC_OUTPUT(), ip_output_pfil(), ip_fragment() and
   * ip_output_send() are passed through (EINPROGRESS from IPSEC_OUTPUT()
   * is reported as 0).
   */
  282. int
  283. ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
  284. struct ip_moptions *imo, struct inpcb *inp)
  285. {
  286. struct rm_priotracker in_ifa_tracker;
  287. struct ip *ip;
  288. struct ifnet *ifp = NULL; /* keep compiler happy */
  289. struct mbuf *m0;
  290. int hlen = sizeof (struct ip);
  291. int mtu = 0;
  292. int error = 0;
  293. int vlan_pcp = -1;
  294. struct sockaddr_in *dst, sin;
  295. const struct sockaddr_in *gw;
  296. struct in_ifaddr *ia = NULL;
  297. struct in_addr src;
  298. int isbroadcast;
  299. uint16_t ip_len, ip_off;
  300. uint32_t fibnum;
  301. #if defined(IPSEC) || defined(IPSEC_SUPPORT)
  302. int no_route_but_check_spd = 0;
  303. #endif
  304. M_ASSERTPKTHDR(m);
  305. NET_EPOCH_ASSERT();
  /* Inherit FIB, flow id, VLAN priority and NUMA domain from the PCB. */
  306. if (inp != NULL) {
  307. INP_LOCK_ASSERT(inp);
  308. M_SETFIB(m, inp->inp_inc.inc_fibnum);
  309. if ((flags & IP_NODEFAULTFLOWID) == 0) {
  310. m->m_pkthdr.flowid = inp->inp_flowid;
  311. M_HASHTYPE_SET(m, inp->inp_flowtype);
  312. }
  313. if ((inp->inp_flags2 & INP_2PCP_SET) != 0)
  314. vlan_pcp = (inp->inp_flags2 & INP_2PCP_MASK) >>
  315. INP_2PCP_SHIFT;
  316. #ifdef NUMA
  317. m->m_pkthdr.numa_domain = inp->inp_numa_domain;
  318. #endif
  319. }
  320. if (opt) {
  321. int len = 0;
  322. m = ip_insertoptions(m, opt, &len);
  323. if (len != 0)
  324. hlen = len; /* ip->ip_hl is updated above */
  325. }
  326. ip = mtod(m, struct ip *);
  /* Cache the length and fragment-offset fields in host byte order. */
  327. ip_len = ntohs(ip->ip_len);
  328. ip_off = ntohs(ip->ip_off);
  329. if ((flags & (IP_FORWARDING|IP_RAWOUTPUT)) == 0) {
  330. ip->ip_v = IPVERSION;
  331. ip->ip_hl = hlen >> 2;
  332. ip_fillid(ip);
  333. } else {
  334. /* Header already set, fetch hlen from there */
  335. hlen = ip->ip_hl << 2;
  336. }
  337. if ((flags & IP_FORWARDING) == 0)
  338. IPSTAT_INC(ips_localout);
  339. /*
  340. * dst/gw handling:
  341. *
  342. * gw is readonly but can point either to dst OR rt_gateway,
  343. * therefore we need restore gw if we're redoing lookup.
  344. */
  345. fibnum = (inp != NULL) ? inp->inp_inc.inc_fibnum : M_GETFIB(m);
  346. if (ro != NULL)
  347. dst = (struct sockaddr_in *)&ro->ro_dst;
  348. else
  349. dst = &sin;
  350. if (ro == NULL || ro->ro_nh == NULL) {
  351. bzero(dst, sizeof(*dst));
  352. dst->sin_family = AF_INET;
  353. dst->sin_len = sizeof(*dst);
  354. dst->sin_addr = ip->ip_dst;
  355. }
  356. gw = dst;
  /* Re-entered from the pfil handling below when the lookup must be redone. */
  357. again:
  358. /*
  359. * Validate route against routing table additions;
  360. * a better/more specific route might have been added.
  361. */
  362. if (inp != NULL && ro != NULL && ro->ro_nh != NULL)
  363. NH_VALIDATE(ro, &inp->inp_rt_cookie, fibnum);
  364. /*
  365. * If there is a cached route,
  366. * check that it is to the same destination
  367. * and is still up. If not, free it and try again.
  368. * The address family should also be checked in case of sharing the
  369. * cache with IPv6.
  370. * Also check whether routing cache needs invalidation.
  371. */
  372. if (ro != NULL && ro->ro_nh != NULL &&
  373. ((!NH_IS_VALID(ro->ro_nh)) || dst->sin_family != AF_INET ||
  374. dst->sin_addr.s_addr != ip->ip_dst.s_addr))
  375. RO_INVALIDATE_CACHE(ro);
  376. ia = NULL;
  377. /*
  378. * If routing to interface only, short circuit routing lookup.
  379. * The use of an all-ones broadcast address implies this; an
  380. * interface is specified by the broadcast address of an interface,
  381. * or the destination address of a ptp interface.
  382. */
  383. if (flags & IP_SENDONES) {
  384. if ((ia = ifatoia(ifa_ifwithbroadaddr(sintosa(dst),
  385. M_GETFIB(m)))) == NULL &&
  386. (ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
  387. M_GETFIB(m)))) == NULL) {
  388. IPSTAT_INC(ips_noroute);
  389. error = ENETUNREACH;
  390. goto bad;
  391. }
  392. ip->ip_dst.s_addr = INADDR_BROADCAST;
  393. dst->sin_addr = ip->ip_dst;
  394. ifp = ia->ia_ifp;
  395. mtu = ifp->if_mtu;
  396. ip->ip_ttl = 1;
  397. isbroadcast = 1;
  398. src = IA_SIN(ia)->sin_addr;
  399. } else if (flags & IP_ROUTETOIF) {
  400. if ((ia = ifatoia(ifa_ifwithdstaddr(sintosa(dst),
  401. M_GETFIB(m)))) == NULL &&
  402. (ia = ifatoia(ifa_ifwithnet(sintosa(dst), 0,
  403. M_GETFIB(m)))) == NULL) {
  404. IPSTAT_INC(ips_noroute);
  405. error = ENETUNREACH;
  406. goto bad;
  407. }
  408. ifp = ia->ia_ifp;
  409. mtu = ifp->if_mtu;
  410. ip->ip_ttl = 1;
  411. isbroadcast = ifp->if_flags & IFF_BROADCAST ?
  412. in_ifaddr_broadcast(dst->sin_addr, ia) : 0;
  413. src = IA_SIN(ia)->sin_addr;
  414. } else if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) &&
  415. imo != NULL && imo->imo_multicast_ifp != NULL) {
  416. /*
  417. * Bypass the normal routing lookup for multicast
  418. * packets if the interface is specified.
  419. */
  420. ifp = imo->imo_multicast_ifp;
  421. mtu = ifp->if_mtu;
  422. IFP_TO_IA(ifp, ia, &in_ifa_tracker);
  423. isbroadcast = 0; /* fool gcc */
  424. /* Interface may have no addresses. */
  425. if (ia != NULL)
  426. src = IA_SIN(ia)->sin_addr;
  427. else
  428. src.s_addr = INADDR_ANY;
  429. } else if (ro != NULL) {
  430. if (ro->ro_nh == NULL) {
  431. /*
  432. * We want to do any cloning requested by the link
  433. * layer, as this is probably required in all cases
  434. * for correct operation (as it is for ARP).
  435. */
  436. uint32_t flowid;
  437. flowid = m->m_pkthdr.flowid;
  438. ro->ro_nh = fib4_lookup(fibnum, dst->sin_addr, 0,
  439. NHR_REF, flowid);
  440. if (ro->ro_nh == NULL || (!NH_IS_VALID(ro->ro_nh))) {
  441. #if defined(IPSEC) || defined(IPSEC_SUPPORT)
  442. /*
  443. * There is no route for this packet, but it is
  444. * possible that a matching SPD entry exists.
  445. */
  446. no_route_but_check_spd = 1;
  447. goto sendit;
  448. #endif
  449. IPSTAT_INC(ips_noroute);
  450. error = EHOSTUNREACH;
  451. goto bad;
  452. }
  453. }
  454. ia = ifatoia(ro->ro_nh->nh_ifa);
  455. ifp = ro->ro_nh->nh_ifp;
  456. counter_u64_add(ro->ro_nh->nh_pksent, 1);
  457. rt_update_ro_flags(ro);
  458. if (ro->ro_nh->nh_flags & NHF_GATEWAY)
  459. gw = &ro->ro_nh->gw4_sa;
  460. if (ro->ro_nh->nh_flags & NHF_HOST)
  461. isbroadcast = (ro->ro_nh->nh_flags & NHF_BROADCAST);
  462. else if (ifp->if_flags & IFF_BROADCAST)
  463. isbroadcast = in_ifaddr_broadcast(gw->sin_addr, ia);
  464. else
  465. isbroadcast = 0;
  466. if (ro->ro_nh->nh_flags & NHF_HOST)
  467. mtu = ro->ro_nh->nh_mtu;
  468. else
  469. mtu = ifp->if_mtu;
  470. src = IA_SIN(ia)->sin_addr;
  471. } else {
  472. struct nhop_object *nh;
  473. nh = fib4_lookup(M_GETFIB(m), ip->ip_dst, 0, NHR_NONE,
  474. m->m_pkthdr.flowid);
  475. if (nh == NULL) {
  476. #if defined(IPSEC) || defined(IPSEC_SUPPORT)
  477. /*
  478. * There is no route for this packet, but it is
  479. * possible that a matching SPD entry exists.
  480. */
  481. no_route_but_check_spd = 1;
  482. goto sendit;
  483. #endif
  484. IPSTAT_INC(ips_noroute);
  485. error = EHOSTUNREACH;
  486. goto bad;
  487. }
  488. ifp = nh->nh_ifp;
  489. mtu = nh->nh_mtu;
  490. /*
  491. * We are rewriting here dst to be gw actually, contradicting
  492. * comment at the beginning of the function. However, in this
  493. * case we are always dealing with on stack dst.
  494. * In case if pfil(9) sends us back to beginning of the
  495. * function, the dst would be rewritten by ip_output_pfil().
  496. */
  497. MPASS(dst == &sin);
  498. if (nh->nh_flags & NHF_GATEWAY)
  499. dst->sin_addr = nh->gw4_sa.sin_addr;
  500. ia = ifatoia(nh->nh_ifa);
  501. src = IA_SIN(ia)->sin_addr;
  502. isbroadcast = (((nh->nh_flags & (NHF_HOST | NHF_BROADCAST)) ==
  503. (NHF_HOST | NHF_BROADCAST)) ||
  504. ((ifp->if_flags & IFF_BROADCAST) &&
  505. in_ifaddr_broadcast(dst->sin_addr, ia)));
  506. }
  507. /* Catch a possible divide by zero later. */
  508. KASSERT(mtu > 0, ("%s: mtu %d <= 0, ro=%p (nh_flags=0x%08x) ifp=%p",
  509. __func__, mtu, ro,
  510. (ro != NULL && ro->ro_nh != NULL) ? ro->ro_nh->nh_flags : 0, ifp));
  511. if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) {
  512. m->m_flags |= M_MCAST;
  513. /*
  514. * IP destination address is multicast. Make sure "gw"
  515. * still points to the address in "ro". (It may have been
  516. * changed to point to a gateway address, above.)
  517. */
  518. gw = dst;
  519. /*
  520. * See if the caller provided any multicast options
  521. */
  522. if (imo != NULL) {
  523. ip->ip_ttl = imo->imo_multicast_ttl;
  524. if (imo->imo_multicast_vif != -1)
  525. ip->ip_src.s_addr =
  526. ip_mcast_src ?
  527. ip_mcast_src(imo->imo_multicast_vif) :
  528. INADDR_ANY;
  529. } else
  530. ip->ip_ttl = IP_DEFAULT_MULTICAST_TTL;
  531. /*
  532. * Confirm that the outgoing interface supports multicast.
  533. */
  534. if ((imo == NULL) || (imo->imo_multicast_vif == -1)) {
  535. if ((ifp->if_flags & IFF_MULTICAST) == 0) {
  536. IPSTAT_INC(ips_noroute);
  537. error = ENETUNREACH;
  538. goto bad;
  539. }
  540. }
  541. /*
  542. * If source address not specified yet, use address
  543. * of outgoing interface.
  544. */
  545. if (ip->ip_src.s_addr == INADDR_ANY)
  546. ip->ip_src = src;
  547. if ((imo == NULL && in_mcast_loop) ||
  548. (imo && imo->imo_multicast_loop)) {
  549. /*
  550. * Loop back multicast datagram if not expressly
  551. * forbidden to do so, even if we are not a member
  552. * of the group; ip_input() will filter it later,
  553. * thus deferring a hash lookup and mutex acquisition
  554. * at the expense of a cheap copy using m_copym().
  555. */
  556. ip_mloopback(ifp, m, hlen);
  557. } else {
  558. /*
  559. * If we are acting as a multicast router, perform
  560. * multicast forwarding as if the packet had just
  561. * arrived on the interface to which we are about
  562. * to send. The multicast forwarding function
  563. * recursively calls this function, using the
  564. * IP_FORWARDING flag to prevent infinite recursion.
  565. *
  566. * Multicasts that are looped back by ip_mloopback(),
  567. * above, will be forwarded by the ip_input() routine,
  568. * if necessary.
  569. */
  570. if (V_ip_mrouter && (flags & IP_FORWARDING) == 0) {
  571. /*
  572. * If rsvp daemon is not running, do not
  573. * set ip_moptions. This ensures that the packet
  574. * is multicast and not just sent down one link
  575. * as prescribed by rsvpd.
  576. */
  577. if (!V_rsvp_on)
  578. imo = NULL;
  579. if (ip_mforward &&
  580. ip_mforward(ip, ifp, m, imo) != 0) {
  581. m_freem(m);
  582. goto done;
  583. }
  584. }
  585. }
  586. /*
  587. * Multicasts with a time-to-live of zero may be looped-
  588. * back, above, but must not be transmitted on a network.
  589. * Also, multicasts addressed to the loopback interface
  590. * are not sent -- the above call to ip_mloopback() will
  591. * loop back a copy. ip_input() will drop the copy if
  592. * this host does not belong to the destination group on
  593. * the loopback interface.
  594. */
  595. if (ip->ip_ttl == 0 || ifp->if_flags & IFF_LOOPBACK) {
  596. m_freem(m);
  597. goto done;
  598. }
  599. goto sendit;
  600. }
  601. /*
  602. * If the source address is not specified yet, use the address
  603. * of the outgoing interface.
  604. */
  605. if (ip->ip_src.s_addr == INADDR_ANY)
  606. ip->ip_src = src;
  607. /*
  608. * Look for broadcast address and
  609. * verify user is allowed to send
  610. * such a packet.
  611. */
  612. if (isbroadcast) {
  613. if ((ifp->if_flags & IFF_BROADCAST) == 0) {
  614. error = EADDRNOTAVAIL;
  615. goto bad;
  616. }
  617. if ((flags & IP_ALLOWBROADCAST) == 0) {
  618. error = EACCES;
  619. goto bad;
  620. }
  621. /* don't allow broadcast messages to be fragmented */
  622. if (ip_len > mtu) {
  623. error = EMSGSIZE;
  624. goto bad;
  625. }
  626. m->m_flags |= M_BCAST;
  627. } else {
  628. m->m_flags &= ~M_BCAST;
  629. }
  630. sendit:
  631. #if defined(IPSEC) || defined(IPSEC_SUPPORT)
  632. if (IPSEC_ENABLED(ipv4)) {
  633. if ((error = IPSEC_OUTPUT(ipv4, m, inp)) != 0) {
  634. if (error == EINPROGRESS)
  635. error = 0;
  636. goto done;
  637. }
  638. }
  639. /*
  640. * Check if there was a route for this packet; return error if not.
  641. */
  642. if (no_route_but_check_spd) {
  643. IPSTAT_INC(ips_noroute);
  644. error = EHOSTUNREACH;
  645. goto bad;
  646. }
  647. /* Update variables that are affected by ipsec4_output(). */
  648. ip = mtod(m, struct ip *);
  649. hlen = ip->ip_hl << 2;
  650. #endif /* IPSEC */
  651. /* Jump over all PFIL processing if hooks are not active. */
  652. if (PFIL_HOOKED_OUT(V_inet_pfil_head)) {
  653. switch (ip_output_pfil(&m, ifp, flags, inp, dst, &fibnum,
  654. &error)) {
  655. case 1: /* Finished */
  656. goto done;
  657. case 0: /* Continue normally */
  658. ip = mtod(m, struct ip *);
  659. break;
  660. case -1: /* Need to try again */
  661. /* Reset everything for a new round */
  662. if (ro != NULL) {
  663. RO_NHFREE(ro);
  664. ro->ro_prepend = NULL;
  665. }
  666. gw = dst;
  667. ip = mtod(m, struct ip *);
  668. goto again;
  669. }
  670. }
  671. if (vlan_pcp > -1)
  672. EVL_APPLY_PRI(m, vlan_pcp);
  673. /* IN_LOOPBACK must not appear on the wire - RFC1122. */
  674. if (IN_LOOPBACK(ntohl(ip->ip_dst.s_addr)) ||
  675. IN_LOOPBACK(ntohl(ip->ip_src.s_addr))) {
  676. if ((ifp->if_flags & IFF_LOOPBACK) == 0) {
  677. IPSTAT_INC(ips_badaddr);
  678. error = EADDRNOTAVAIL;
  679. goto bad;
  680. }
  681. }
  682. m->m_pkthdr.csum_flags |= CSUM_IP;
  /* Compute in software any delayed checksum the interface does not offload. */
  683. if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA & ~ifp->if_hwassist) {
  684. m = mb_unmapped_to_ext(m);
  685. if (m == NULL) {
  686. IPSTAT_INC(ips_odropped);
  687. error = ENOBUFS;
  688. goto bad;
  689. }
  690. in_delayed_cksum(m);
  691. m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
  692. } else if ((ifp->if_capenable & IFCAP_NOMAP) == 0) {
  693. m = mb_unmapped_to_ext(m);
  694. if (m == NULL) {
  695. IPSTAT_INC(ips_odropped);
  696. error = ENOBUFS;
  697. goto bad;
  698. }
  699. }
  700. #if defined(SCTP) || defined(SCTP_SUPPORT)
  701. if (m->m_pkthdr.csum_flags & CSUM_SCTP & ~ifp->if_hwassist) {
  702. m = mb_unmapped_to_ext(m);
  703. if (m == NULL) {
  704. IPSTAT_INC(ips_odropped);
  705. error = ENOBUFS;
  706. goto bad;
  707. }
  708. sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2));
  709. m->m_pkthdr.csum_flags &= ~CSUM_SCTP;
  710. }
  711. #endif
  712. /*
  713. * If small enough for interface, or the interface will take
  714. * care of the fragmentation for us, we can just send directly.
  715. * Note that if_vxlan could have requested TSO even though the outer
  716. * frame is UDP. It is correct to not fragment such datagrams and
  717. * instead just pass them on to the driver.
  718. */
  719. if (ip_len <= mtu ||
  720. (m->m_pkthdr.csum_flags & ifp->if_hwassist &
  721. (CSUM_TSO | CSUM_INNER_TSO)) != 0) {
  722. ip->ip_sum = 0;
  723. if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
  724. ip->ip_sum = in_cksum(m, hlen);
  725. m->m_pkthdr.csum_flags &= ~CSUM_IP;
  726. }
  727. /*
  728. * Record statistics for this interface address.
  729. * With CSUM_TSO the byte/packet count will be slightly
  730. * incorrect because we count the IP+TCP headers only
  731. * once instead of for every generated packet.
  732. */
  733. if (!(flags & IP_FORWARDING) && ia) {
  734. if (m->m_pkthdr.csum_flags &
  735. (CSUM_TSO | CSUM_INNER_TSO))
  736. counter_u64_add(ia->ia_ifa.ifa_opackets,
  737. m->m_pkthdr.len / m->m_pkthdr.tso_segsz);
  738. else
  739. counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
  740. counter_u64_add(ia->ia_ifa.ifa_obytes, m->m_pkthdr.len);
  741. }
  742. #ifdef MBUF_STRESS_TEST
  743. if (mbuf_frag_size && m->m_pkthdr.len > mbuf_frag_size)
  744. m = m_fragment(m, M_NOWAIT, mbuf_frag_size);
  745. #endif
  746. /*
  747. * Reset layer specific mbuf flags
  748. * to avoid confusing lower layers.
  749. */
  750. m_clrprotoflags(m);
  751. IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
  752. error = ip_output_send(inp, ifp, m, gw, ro,
  753. (flags & IP_NO_SND_TAG_RL) ? false : true);
  754. goto done;
  755. }
  756. /* Balk when DF bit is set or the interface didn't support TSO. */
  757. if ((ip_off & IP_DF) ||
  758. (m->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_INNER_TSO))) {
  759. error = EMSGSIZE;
  760. IPSTAT_INC(ips_cantfrag);
  761. goto bad;
  762. }
  763. /*
  764. * Too large for interface; fragment if possible. If successful,
  765. * on return, m will point to a list of packets to be sent.
  766. */
  767. error = ip_fragment(ip, &m, mtu, ifp->if_hwassist);
  /*
   * NOTE(review): ip_fragment()'s header comment says a partially built
   * fragment chain may be left for the caller to free. The "bad" path below
   * calls m_freem() on the head packet only -- confirm that fragments linked
   * through m_nextpkt are released as well.
   */
  768. if (error)
  769. goto bad;
  /* Send the fragments one by one; once a send fails, free the rest. */
  770. for (; m; m = m0) {
  771. m0 = m->m_nextpkt;
  772. m->m_nextpkt = 0;
  773. if (error == 0) {
  774. /* Record statistics for this interface address. */
  775. if (ia != NULL) {
  776. counter_u64_add(ia->ia_ifa.ifa_opackets, 1);
  777. counter_u64_add(ia->ia_ifa.ifa_obytes,
  778. m->m_pkthdr.len);
  779. }
  780. /*
  781. * Reset layer specific mbuf flags
  782. * to avoid confusing lower layers.
  783. */
  784. m_clrprotoflags(m);
  785. IP_PROBE(send, NULL, NULL, mtod(m, struct ip *), ifp,
  786. mtod(m, struct ip *), NULL);
  787. error = ip_output_send(inp, ifp, m, gw, ro, true);
  788. } else
  789. m_freem(m);
  790. }
  791. if (error == 0)
  792. IPSTAT_INC(ips_fragmented);
  793. done:
  794. return (error);
  /* Error exit: the packet was not handed off, so free it before returning. */
  795. bad:
  796. m_freem(m);
  797. goto done;
  798. }
  799. /*
  800. * Create a chain of fragments which fit the given mtu. m_frag points to the
  801. * mbuf to be fragmented; on return it points to the chain with the fragments.
  802. * Return 0 if no error. If error, m_frag may contain a partially built
  803. * chain of fragments that should be freed by the caller.
  804. *
  805. * if_hwassist_flags is the hw offload capabilities (see if_data.ifi_hwassist)
  806. */
  807. int
  808. ip_fragment(struct ip *ip, struct mbuf **m_frag, int mtu,
  809. u_long if_hwassist_flags)
  810. {
  811. int error = 0;
  812. int hlen = ip->ip_hl << 2;
  813. int len = (mtu - hlen) & ~7; /* size of payload in each fragment */
  814. int off;
  815. struct mbuf *m0 = *m_frag; /* the original packet */
  816. int firstlen;
  817. struct mbuf **mnext;
  818. int nfrags;
  819. uint16_t ip_len, ip_off;
  820. ip_len = ntohs(ip->ip_len);
  821. ip_off = ntohs(ip->ip_off);
  822. if (ip_off & IP_DF) { /* Fragmentation not allowed */
  823. IPSTAT_INC(ips_cantfrag);
  824. return EMSGSIZE;
  825. }
  826. /*
  827. * Must be able to put at least 8 bytes per fragment.
  828. */
  829. if (len < 8)
  830. return EMSGSIZE;
  831. /*
  832. * If the interface will not calculate checksums on
  833. * fragmented packets, then do it here.
  834. */
  835. if (m0->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
  836. m0 = mb_unmapped_to_ext(m0);
  837. if (m0 == NULL) {
  838. error = ENOBUFS;
  839. IPSTAT_INC(ips_odropped);
  840. goto done;
  841. }
  842. in_delayed_cksum(m0);
  843. m0->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
  844. }
  845. #if defined(SCTP) || defined(SCTP_SUPPORT)
  846. if (m0->m_pkthdr.csum_flags & CSUM_SCTP) {
  847. m0 = mb_unmapped_to_ext(m0);
  848. if (m0 == NULL) {
  849. error = ENOBUFS;
  850. IPSTAT_INC(ips_odropped);
  851. goto done;
  852. }
  853. sctp_delayed_cksum(m0, hlen);
  854. m0->m_pkthdr.csum_flags &= ~CSUM_SCTP;
  855. }
  856. #endif
  857. if (len > PAGE_SIZE) {
  858. /*
  859. * Fragment large datagrams such that each segment
  860. * contains a multiple of PAGE_SIZE amount of data,
  861. * plus headers. This enables a receiver to perform
  862. * page-flipping zero-copy optimizations.
  863. *
  864. * XXX When does this help given that sender and receiver
  865. * could have different page sizes, and also mtu could
  866. * be less than the receiver's page size ?
  867. */
  868. int newlen;
  869. off = MIN(mtu, m0->m_pkthdr.len);
  870. /*
  871. * firstlen (off - hlen) must be aligned on an
  872. * 8-byte boundary
  873. */
  874. if (off < hlen)
  875. goto smart_frag_failure;
  876. off = ((off - hlen) & ~7) + hlen;
  877. newlen = (~PAGE_MASK) & mtu;
  878. if ((newlen + sizeof (struct ip)) > mtu) {
  879. /* we failed, go back the default */
  880. smart_frag_failure:
  881. newlen = len;
  882. off = hlen + len;
  883. }
  884. len = newlen;
  885. } else {
  886. off = hlen + len;
  887. }
  888. firstlen = off - hlen;
  889. mnext = &m0->m_nextpkt; /* pointer to next packet */
  890. /*
  891. * Loop through length of segment after first fragment,
  892. * make new header and copy data of each part and link onto chain.
  893. * Here, m0 is the original packet, m is the fragment being created.
  894. * The fragments are linked off the m_nextpkt of the original
  895. * packet, which after processing serves as the first fragment.
  896. */
  897. for (nfrags = 1; off < ip_len; off += len, nfrags++) {
  898. struct ip *mhip; /* ip header on the fragment */
  899. struct mbuf *m;
  900. int mhlen = sizeof (struct ip);
  901. m = m_gethdr(M_NOWAIT, MT_DATA);
  902. if (m == NULL) {
  903. error = ENOBUFS;
  904. IPSTAT_INC(ips_odropped);
  905. goto done;
  906. }
  907. /*
  908. * Make sure the complete packet header gets copied
  909. * from the originating mbuf to the newly created
  910. * mbuf. This also ensures that existing firewall
  911. * classification(s), VLAN tags and so on get copied
  912. * to the resulting fragmented packet(s):
  913. */
  914. if (m_dup_pkthdr(m, m0, M_NOWAIT) == 0) {
  915. m_free(m);
  916. error = ENOBUFS;
  917. IPSTAT_INC(ips_odropped);
  918. goto done;
  919. }
  920. /*
  921. * In the first mbuf, leave room for the link header, then
  922. * copy the original IP header including options. The payload
  923. * goes into an additional mbuf chain returned by m_copym().
  924. */
  925. m->m_data += max_linkhdr;
  926. mhip = mtod(m, struct ip *);
  927. *mhip = *ip;
  928. if (hlen > sizeof (struct ip)) {
  929. mhlen = ip_optcopy(ip, mhip) + sizeof (struct ip);
  930. mhip->ip_v = IPVERSION;
  931. mhip->ip_hl = mhlen >> 2;
  932. }
  933. m->m_len = mhlen;
  934. /* XXX do we need to add ip_off below ? */
  935. mhip->ip_off = ((off - hlen) >> 3) + ip_off;
  936. if (off + len >= ip_len)
  937. len = ip_len - off;
  938. else
  939. mhip->ip_off |= IP_MF;
  940. mhip->ip_len = htons((u_short)(len + mhlen));
  941. m->m_next = m_copym(m0, off, len, M_NOWAIT);
  942. if (m->m_next == NULL) { /* copy failed */
  943. m_free(m);
  944. error = ENOBUFS; /* ??? */
  945. IPSTAT_INC(ips_odropped);
  946. goto done;
  947. }
  948. m->m_pkthdr.len = mhlen + len;
  949. #ifdef MAC
  950. mac_netinet_fragment(m0, m);
  951. #endif
  952. mhip->ip_off = htons(mhip->ip_off);
  953. mhip->ip_sum = 0;
  954. if (m->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
  955. mhip->ip_sum = in_cksum(m, mhlen);
  956. m->m_pkthdr.csum_flags &= ~CSUM_IP;
  957. }
  958. *mnext = m;
  959. mnext = &m->m_nextpkt;
  960. }
  961. IPSTAT_ADD(ips_ofragments, nfrags);
  962. /*
  963. * Update first fragment by trimming what's been copied out
  964. * and updating header.
  965. */
  966. m_adj(m0, hlen + firstlen - ip_len);
  967. m0->m_pkthdr.len = hlen + firstlen;
  968. ip->ip_len = htons((u_short)m0->m_pkthdr.len);
  969. ip->ip_off = htons(ip_off | IP_MF);
  970. ip->ip_sum = 0;
  971. if (m0->m_pkthdr.csum_flags & CSUM_IP & ~if_hwassist_flags) {
  972. ip->ip_sum = in_cksum(m0, hlen);
  973. m0->m_pkthdr.csum_flags &= ~CSUM_IP;
  974. }
  975. done:
  976. *m_frag = m0;
  977. return error;
  978. }
  979. void
  980. in_delayed_cksum(struct mbuf *m)
  981. {
  982. struct ip *ip;
  983. struct udphdr *uh;
  984. uint16_t cklen, csum, offset;
  985. ip = mtod(m, struct ip *);
  986. offset = ip->ip_hl << 2 ;
  987. if (m->m_pkthdr.csum_flags & CSUM_UDP) {
  988. /* if udp header is not in the first mbuf copy udplen */
  989. if (offset + sizeof(struct udphdr) > m->m_len) {
  990. m_copydata(m, offset + offsetof(struct udphdr,
  991. uh_ulen), sizeof(cklen), (caddr_t)&cklen);
  992. cklen = ntohs(cklen);
  993. } else {
  994. uh = (struct udphdr *)mtodo(m, offset);
  995. cklen = ntohs(uh->uh_ulen);
  996. }
  997. csum = in_cksum_skip(m, cklen + offset, offset);
  998. if (csum == 0)
  999. csum = 0xffff;
  1000. } else {
  1001. cklen = ntohs(ip->ip_len);
  1002. csum = in_cksum_skip(m, cklen, offset);
  1003. }
  1004. offset += m->m_pkthdr.csum_data; /* checksum offset */
  1005. if (offset + sizeof(csum) > m->m_len)
  1006. m_copyback(m, offset, sizeof(csum), (caddr_t)&csum);
  1007. else
  1008. *(u_short *)mtodo(m, offset) = csum;
  1009. }
  1010. /*
  1011. * IP socket option processing.
  1012. */
  1013. int
  1014. ip_ctloutput(struct socket *so, struct sockopt *sopt)
  1015. {
  1016. struct inpcb *inp = sotoinpcb(so);
  1017. int error, optval;
  1018. #ifdef RSS
  1019. uint32_t rss_bucket;
  1020. int retval;
  1021. #endif
  1022. error = optval = 0;
  1023. if (sopt->sopt_level != IPPROTO_IP) {
  1024. error = EINVAL;
  1025. if (sopt->sopt_level == SOL_SOCKET &&
  1026. sopt->sopt_dir == SOPT_SET) {
  1027. switch (sopt->sopt_name) {
  1028. case SO_REUSEADDR:
  1029. INP_WLOCK(inp);
  1030. if ((so->so_options & SO_REUSEADDR) != 0)
  1031. inp->inp_flags2 |= INP_REUSEADDR;
  1032. else
  1033. inp->inp_flags2 &= ~INP_REUSEADDR;
  1034. INP_WUNLOCK(inp);
  1035. error = 0;
  1036. break;
  1037. case SO_REUSEPORT:
  1038. INP_WLOCK(inp);
  1039. if ((so->so_options & SO_REUSEPORT) != 0)
  1040. inp->inp_flags2 |= INP_REUSEPORT;
  1041. else
  1042. inp->inp_flags2 &= ~INP_REUSEPORT;
  1043. INP_WUNLOCK(inp);
  1044. error = 0;
  1045. break;
  1046. case SO_REUSEPORT_LB:
  1047. INP_WLOCK(inp);
  1048. if ((so->so_options & SO_REUSEPORT_LB) != 0)
  1049. inp->inp_flags2 |= INP_REUSEPORT_LB;
  1050. else
  1051. inp->inp_flags2 &= ~INP_REUSEPORT_LB;
  1052. INP_WUNLOCK(inp);
  1053. error = 0;
  1054. break;
  1055. case SO_SETFIB:
  1056. INP_WLOCK(inp);
  1057. inp->inp_inc.inc_fibnum = so->so_fibnum;
  1058. INP_WUNLOCK(inp);
  1059. error = 0;
  1060. break;
  1061. case SO_MAX_PACING_RATE:
  1062. #ifdef RATELIMIT
  1063. INP_WLOCK(inp);
  1064. inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
  1065. INP_WUNLOCK(inp);
  1066. error = 0;
  1067. #else
  1068. error = EOPNOTSUPP;
  1069. #endif
  1070. break;
  1071. default:
  1072. break;
  1073. }
  1074. }
  1075. return (error);
  1076. }
  1077. switch (sopt->sopt_dir) {
  1078. case SOPT_SET:
  1079. switch (sopt->sopt_name) {
  1080. case IP_OPTIONS:
  1081. #ifdef notyet
  1082. case IP_RETOPTS:
  1083. #endif
  1084. {
  1085. struct mbuf *m;
  1086. if (sopt->sopt_valsize > MLEN) {
  1087. error = EMSGSIZE;
  1088. break;
  1089. }
  1090. m = m_get(sopt->sopt_td ? M_WAITOK : M_NOWAIT, MT_DATA);
  1091. if (m == NULL) {
  1092. error = ENOBUFS;
  1093. break;
  1094. }
  1095. m->m_len = sopt->sopt_valsize;
  1096. error = sooptcopyin(sopt, mtod(m, char *), m->m_len,
  1097. m->m_len);
  1098. if (error) {
  1099. m_free(m);
  1100. break;
  1101. }
  1102. INP_WLOCK(inp);
  1103. error = ip_pcbopts(inp, sopt->sopt_name, m);
  1104. INP_WUNLOCK(inp);
  1105. return (error);
  1106. }
  1107. case IP_BINDANY:
  1108. if (sopt->sopt_td != NULL) {
  1109. error = priv_check(sopt->sopt_td,
  1110. PRIV_NETINET_BINDANY);
  1111. if (error)
  1112. break;
  1113. }
  1114. /* FALLTHROUGH */
  1115. case IP_BINDMULTI:
  1116. #ifdef RSS
  1117. case IP_RSS_LISTEN_BUCKET:
  1118. #endif
  1119. case IP_TOS:
  1120. case IP_TTL:
  1121. case IP_MINTTL:
  1122. case IP_RECVOPTS:
  1123. case IP_RECVRETOPTS:
  1124. case IP_ORIGDSTADDR:
  1125. case IP_RECVDSTADDR:
  1126. case IP_RECVTTL:
  1127. case IP_RECVIF:
  1128. case IP_ONESBCAST:
  1129. case IP_DONTFRAG:
  1130. case IP_RECVTOS:
  1131. case IP_RECVFLOWID:
  1132. #ifdef RSS
  1133. case IP_RECVRSSBUCKETID:
  1134. #endif
  1135. case IP_VLAN_PCP:
  1136. error = sooptcopyin(sopt, &optval, sizeof optval,
  1137. sizeof optval);
  1138. if (error)
  1139. break;
  1140. switch (sopt->sopt_name) {
  1141. case IP_TOS:
  1142. inp->inp_ip_tos = optval;
  1143. break;
  1144. case IP_TTL:
  1145. inp->inp_ip_ttl = optval;
  1146. break;
  1147. case IP_MINTTL:
  1148. if (optval >= 0 && optval <= MAXTTL)
  1149. inp->inp_ip_minttl = optval;
  1150. else
  1151. error = EINVAL;
  1152. break;
  1153. #define OPTSET(bit) do { \
  1154. INP_WLOCK(inp); \
  1155. if (optval) \
  1156. inp->inp_flags |= bit; \
  1157. else \
  1158. inp->inp_flags &= ~bit; \
  1159. INP_WUNLOCK(inp); \
  1160. } while (0)
  1161. #define OPTSET2(bit, val) do { \
  1162. INP_WLOCK(inp); \
  1163. if (val) \
  1164. inp->inp_flags2 |= bit; \
  1165. else \
  1166. inp->inp_flags2 &= ~bit; \
  1167. INP_WUNLOCK(inp); \
  1168. } while (0)
  1169. case IP_RECVOPTS:
  1170. OPTSET(INP_RECVOPTS);
  1171. break;
  1172. case IP_RECVRETOPTS:
  1173. OPTSET(INP_RECVRETOPTS);
  1174. break;
  1175. case IP_RECVDSTADDR:
  1176. OPTSET(INP_RECVDSTADDR);
  1177. break;
  1178. case IP_ORIGDSTADDR:
  1179. OPTSET2(INP_ORIGDSTADDR, optval);
  1180. break;
  1181. case IP_RECVTTL:
  1182. OPTSET(INP_RECVTTL);
  1183. break;
  1184. case IP_RECVIF:
  1185. OPTSET(INP_RECVIF);
  1186. break;
  1187. case IP_ONESBCAST:
  1188. OPTSET(INP_ONESBCAST);
  1189. break;
  1190. case IP_DONTFRAG:
  1191. OPTSET(INP_DONTFRAG);
  1192. break;
  1193. case IP_BINDANY:
  1194. OPTSET(INP_BINDANY);
  1195. break;
  1196. case IP_RECVTOS:
  1197. OPTSET(INP_RECVTOS);
  1198. break;
  1199. case IP_BINDMULTI:
  1200. OPTSET2(INP_BINDMULTI, optval);
  1201. break;
  1202. case IP_RECVFLOWID:
  1203. OPTSET2(INP_RECVFLOWID, optval);
  1204. break;
  1205. #ifdef RSS
  1206. case IP_RSS_LISTEN_BUCKET:
  1207. if ((optval >= 0) &&
  1208. (optval < rss_getnumbuckets())) {
  1209. inp->inp_rss_listen_bucket = optval;
  1210. OPTSET2(INP_RSS_BUCKET_SET, 1);
  1211. } else {
  1212. error = EINVAL;
  1213. }
  1214. break;
  1215. case IP_RECVRSSBUCKETID:
  1216. OPTSET2(INP_RECVRSSBUCKETID, optval);
  1217. break;
  1218. #endif
  1219. case IP_VLAN_PCP:
  1220. if ((optval >= -1) && (optval <=
  1221. (INP_2PCP_MASK >> INP_2PCP_SHIFT))) {
  1222. if (optval == -1) {
  1223. INP_WLOCK(inp);
  1224. inp->inp_flags2 &=
  1225. ~(INP_2PCP_SET |
  1226. INP_2PCP_MASK);
  1227. INP_WUNLOCK(inp);
  1228. } else {
  1229. INP_WLOCK(inp);
  1230. inp->inp_flags2 |=
  1231. INP_2PCP_SET;
  1232. inp->inp_flags2 &=
  1233. ~INP_2PCP_MASK;
  1234. inp->inp_flags2 |=
  1235. optval << INP_2PCP_SHIFT;
  1236. INP_WUNLOCK(inp);
  1237. }
  1238. } else
  1239. error = EINVAL;
  1240. break;
  1241. }
  1242. break;
  1243. #undef OPTSET
  1244. #undef OPTSET2
  1245. /*
  1246. * Multicast socket options are processed by the in_mcast
  1247. * module.
  1248. */
  1249. case IP_MULTICAST_IF:
  1250. case IP_MULTICAST_VIF:
  1251. case IP_MULTICAST_TTL:
  1252. case IP_MULTICAST_LOOP:
  1253. case IP_ADD_MEMBERSHIP:
  1254. case IP_DROP_MEMBERSHIP:
  1255. case IP_ADD_SOURCE_MEMBERSHIP:
  1256. case IP_DROP_SOURCE_MEMBERSHIP:
  1257. case IP_BLOCK_SOURCE:
  1258. case IP_UNBLOCK_SOURCE:
  1259. case IP_MSFILTER:
  1260. case MCAST_JOIN_GROUP:
  1261. case MCAST_LEAVE_GROUP:
  1262. case MCAST_JOIN_SOURCE_GROUP:
  1263. case MCAST_LEAVE_SOURCE_GROUP:
  1264. case MCAST_BLOCK_SOURCE:
  1265. case MCAST_UNBLOCK_SOURCE:
  1266. error = inp_setmoptions(inp, sopt);
  1267. break;
  1268. case IP_PORTRANGE:
  1269. error = sooptcopyin(sopt, &optval, sizeof optval,
  1270. sizeof optval);
  1271. if (error)
  1272. break;
  1273. INP_WLOCK(inp);
  1274. switch (optval) {
  1275. case IP_PORTRANGE_DEFAULT:
  1276. inp->inp_flags &= ~(INP_LOWPORT);
  1277. inp->inp_flags &= ~(INP_HIGHPORT);
  1278. break;
  1279. case IP_PORTRANGE_HIGH:
  1280. inp->inp_flags &= ~(INP_LOWPORT);
  1281. inp->inp_flags |= INP_HIGHPORT;
  1282. break;
  1283. case IP_PORTRANGE_LOW:
  1284. inp->inp_flags &= ~(INP_HIGHPORT);
  1285. inp->inp_flags |= INP_LOWPORT;
  1286. break;
  1287. default:
  1288. error = EINVAL;
  1289. break;
  1290. }
  1291. INP_WUNLOCK(inp);
  1292. break;
  1293. #if defined(IPSEC) || defined(IPSEC_SUPPORT)
  1294. case IP_IPSEC_POLICY:
  1295. if (IPSEC_ENABLED(ipv4)) {
  1296. error = IPSEC_PCBCTL(ipv4, inp, sopt);
  1297. break;
  1298. }
  1299. /* FALLTHROUGH */
  1300. #endif /* IPSEC */
  1301. default:
  1302. error = ENOPROTOOPT;
  1303. break;
  1304. }
  1305. break;
  1306. case SOPT_GET:
  1307. switch (sopt->sopt_name) {
  1308. case IP_OPTIONS:
  1309. case IP_RETOPTS:
  1310. INP_RLOCK(inp);
  1311. if (inp->inp_options) {
  1312. struct mbuf *options;
  1313. options = m_copym(inp->inp_options, 0,
  1314. M_COPYALL, M_NOWAIT);
  1315. INP_RUNLOCK(inp);
  1316. if (options != NULL) {
  1317. error = sooptcopyout(sopt,
  1318. mtod(options, char *),
  1319. options->m_len);
  1320. m_freem(options);
  1321. } else
  1322. error = ENOMEM;
  1323. } else {
  1324. INP_RUNLOCK(inp);
  1325. sopt->sopt_valsize = 0;
  1326. }
  1327. break;
  1328. case IP_TOS:
  1329. case IP_TTL:
  1330. case IP_MINTTL:
  1331. case IP_RECVOPTS:
  1332. case IP_RECVRETOPTS:
  1333. case IP_ORIGDSTADDR:
  1334. case IP_RECVDSTADDR:
  1335. case IP_RECVTTL:
  1336. case IP_RECVIF:
  1337. case IP_PORTRANGE:
  1338. case IP_ONESBCAST:
  1339. case IP_DONTFRAG:
  1340. case IP_BINDANY:
  1341. case IP_RECVTOS:
  1342. case IP_BINDMULTI:
  1343. case IP_FLOWID:
  1344. case IP_FLOWTYPE:
  1345. case IP_RECVFLOWID:
  1346. #ifdef RSS
  1347. case IP_RSSBUCKETID:
  1348. case IP_RECVRSSBUCKETID:
  1349. #endif
  1350. case IP_VLAN_PCP:
  1351. switch (sopt->sopt_name) {
  1352. case IP_TOS:
  1353. optval = inp->inp_ip_tos;
  1354. break;
  1355. case IP_TTL:
  1356. optval = inp->inp_ip_ttl;
  1357. break;
  1358. case IP_MINTTL:
  1359. optval = inp->inp_ip_minttl;
  1360. break;
  1361. #define OPTBIT(bit) (inp->inp_flags & bit ? 1 : 0)
  1362. #define OPTBIT2(bit) (inp->inp_flags2 & bit ? 1 : 0)
  1363. case IP_RECVOPTS:
  1364. optval = OPTBIT(INP_RECVOPTS);
  1365. break;
  1366. case IP_RECVRETOPTS:
  1367. optval = OPTBIT(INP_RECVRETOPTS);
  1368. break;
  1369. case IP_RECVDSTADDR:
  1370. optval = OPTBIT(INP_RECVDSTADDR);
  1371. break;
  1372. case IP_ORIGDSTADDR:
  1373. optval = OPTBIT2(INP_ORIGDSTADDR);
  1374. break;
  1375. case IP_RECVTTL:
  1376. optval = OPTBIT(INP_RECVTTL);
  1377. break;
  1378. case IP_RECVIF:
  1379. optval = OPTBIT(INP_RECVIF);
  1380. break;
  1381. case IP_PORTRANGE:
  1382. if (inp->inp_flags & INP_HIGHPORT)
  1383. optval = IP_PORTRANGE_HIGH;
  1384. else if (inp->inp_flags & INP_LOWPORT)
  1385. optval = IP_PORTRANGE_LOW;
  1386. else
  1387. optval = 0;
  1388. break;
  1389. case IP_ONESBCAST:
  1390. optval = OPTBIT(INP_ONESBCAST);
  1391. break;
  1392. case IP_DONTFRAG:
  1393. optval = OPTBIT(INP_DONTFRAG);
  1394. break;
  1395. case IP_BINDANY:
  1396. optval = OPTBIT(INP_BINDANY);
  1397. break;
  1398. case IP_RECVTOS:
  1399. optval = OPTBIT(INP_RECVTOS);
  1400. break;
  1401. case IP_FLOWID:
  1402. optval = inp->inp_flowid;
  1403. break;
  1404. case IP_FLOWTYPE:
  1405. optval = inp->inp_flowtype;
  1406. break;
  1407. case IP_RECVFLOWID:
  1408. optval = OPTBIT2(INP_RECVFLOWID);
  1409. break;
  1410. #ifdef RSS
  1411. case IP_RSSBUCKETID:
  1412. retval = rss_hash2bucket(inp->inp_flowid,
  1413. inp->inp_flowtype,
  1414. &rss_bucket);
  1415. if (retval == 0)
  1416. optval = rss_bucket;
  1417. else
  1418. error = EINVAL;
  1419. break;
  1420. case IP_RECVRSSBUCKETID:
  1421. optval = OPTBIT2(INP_RECVRSSBUCKETID);
  1422. break;
  1423. #endif
  1424. case IP_BINDMULTI:
  1425. optval = OPTBIT2(INP_BINDMULTI);
  1426. break;
  1427. case IP_VLAN_PCP:
  1428. if (OPTBIT2(INP_2PCP_SET)) {
  1429. optval = (inp->inp_flags2 &
  1430. INP_2PCP_MASK) >> INP_2PCP_SHIFT;
  1431. } else {
  1432. optval = -1;
  1433. }
  1434. break;
  1435. }
  1436. error = sooptcopyout(sopt, &optval, sizeof optval);
  1437. break;
  1438. /*
  1439. * Multicast socket options are processed by the in_mcast
  1440. * module.
  1441. */
  1442. case IP_MULTICAST_IF:
  1443. case IP_MULTICAST_VIF:
  1444. case IP_MULTICAST_TTL:
  1445. case IP_MULTICAST_LOOP:
  1446. case IP_MSFILTER:
  1447. error = inp_getmoptions(inp, sopt);
  1448. break;
  1449. #if defined(IPSEC) || defined(IPSEC_SUPPORT)
  1450. case IP_IPSEC_POLICY:
  1451. if (IPSEC_ENABLED(ipv4)) {
  1452. error = IPSEC_PCBCTL(ipv4, inp, sopt);
  1453. break;
  1454. }
  1455. /* FALLTHROUGH */
  1456. #endif /* IPSEC */
  1457. default:
  1458. error = ENOPROTOOPT;
  1459. break;
  1460. }
  1461. break;
  1462. }
  1463. return (error);
  1464. }
  1465. /*
  1466. * Routine called from ip_output() to loop back a copy of an IP multicast
  1467. * packet to the input queue of a specified interface. Note that this
  1468. * calls the output routine of the loopback "driver", but with an interface
  1469. * pointer that might NOT be a loopback interface -- evil, but easier than
  1470. * replicating that code here.
  1471. */
  1472. static void
  1473. ip_mloopback(struct ifnet *ifp, const struct mbuf *m, int hlen)
  1474. {
  1475. struct ip *ip;
  1476. struct mbuf *copym;
  1477. /*
  1478. * Make a deep copy of the packet because we're going to
  1479. * modify the pack in order to generate checksums.
  1480. */
  1481. copym = m_dup(m, M_NOWAIT);
  1482. if (copym != NULL && (!M_WRITABLE(copym) || copym->m_len < hlen))
  1483. copym = m_pullup(copym, hlen);
  1484. if (copym != NULL) {
  1485. /* If needed, compute the checksum and mark it as valid. */
  1486. if (copym->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
  1487. in_delayed_cksum(copym);
  1488. copym->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
  1489. copym->m_pkthdr.csum_flags |=
  1490. CSUM_DATA_VALID | CSUM_PSEUDO_HDR;
  1491. copym->m_pkthdr.csum_data = 0xffff;
  1492. }
  1493. /*
  1494. * We don't bother to fragment if the IP length is greater
  1495. * than the interface's MTU. Can this possibly matter?
  1496. */
  1497. ip = mtod(copym, struct ip *);
  1498. ip->ip_sum = 0;
  1499. ip->ip_sum = in_cksum(copym, hlen);
  1500. if_simloop(ifp, copym, AF_INET, 0);
  1501. }
  1502. }