// ip_fragment.c
// SPDX-License-Identifier: GPL-2.0
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		The IP fragmentation functionality.
 *
 * Authors:	Fred N. van Kempen <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Alan Cox	:	Split from ip.c , see ip_input.c for history.
 *		David S. Miller :	Begin massive cleanup...
 *		Andi Kleen	:	Add sysctls.
 *		xxxx		:	Overlapfrag bug.
 *		Ultima		:	ip_expire() kernel panic.
 *		Bill Hawes	:	Frag accounting and evictor fixes.
 *		John McDonald	:	0 length frag bug.
 *		Alexey Kuznetsov:	SMP races, threading, cleanup.
 *		Patrick McHardy :	LRU queue of frag heads for evictor.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/compiler.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/jiffies.h>
#include <linux/skbuff.h>
#include <linux/list.h>
#include <linux/ip.h>
#include <linux/icmp.h>
#include <linux/netdevice.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <linux/slab.h>
#include <net/route.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/checksum.h>
#include <net/inetpeer.h>
#include <net/inet_frag.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/inet.h>
#include <linux/netfilter_ipv4.h>
#include <net/inet_ecn.h>
#include <net/l3mdev.h>

/* NOTE. Logic of IP defragmentation is parallel to corresponding IPv6
 * code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
 * as well. Or notify me, at least. --ANK
 */
static const char ip_frag_cache_name[] = "ip4-frags";

/* Describe an entry in the "incomplete datagrams" queue. */
struct ipq {
	struct inet_frag_queue q;

	u8		ecn;		/* RFC3168 support */
	u16		max_df_size;	/* largest frag with DF set seen */
	int		iif;
	unsigned int	rid;
	struct inet_peer *peer;
};
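
/* ip4_frag_ecn() one-hot encodes the two ECN bits (tos & INET_ECN_MASK),
 * one bit per codepoint. Each fragment's value is OR-ed into qp->ecn, so
 * ip_frag_reasm() can validate the combination seen across all fragments
 * with a single ip_frag_ecn_table[] lookup (0xff marks an invalid mix).
 */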
static u8 ip4_frag_ecn(u8 tos)
{
	return 1 << (tos & INET_ECN_MASK);
}

static struct inet_frags ip4_frags;

static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
			 struct sk_buff *prev_tail, struct net_device *dev);

static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
{
	struct ipq *qp = container_of(q, struct ipq, q);
	struct netns_ipv4 *ipv4 = container_of(q->net, struct netns_ipv4,
					       frags);
	struct net *net = container_of(ipv4, struct net, ipv4);
	const struct frag_v4_compare_key *key = a;

	q->key.v4 = *key;
	qp->ecn = 0;
	qp->peer = q->net->max_dist ?
		inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) :
		NULL;
}

static void ip4_frag_free(struct inet_frag_queue *q)
{
	struct ipq *qp;

	qp = container_of(q, struct ipq, q);
	if (qp->peer)
		inet_putpeer(qp->peer);
}

/* Destruction primitives. */

static void ipq_put(struct ipq *ipq)
{
	inet_frag_put(&ipq->q);
}

/* Kill ipq entry. It is not destroyed immediately,
 * because caller (and someone more) holds reference count.
 */
static void ipq_kill(struct ipq *ipq)
{
	inet_frag_kill(&ipq->q);
}
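
/* These defrag "users" (AF_PACKET taps, conntrack on the input and bridge
 * paths) may be reassembling packets that are not addressed to this host.
 * Per RFC 792 only the destination host should emit a "Fragment Reassembly
 * Timeout", so for these users ip_expire() sends the ICMP error only when
 * the route turns out to be RTN_LOCAL.
 */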
static bool frag_expire_skip_icmp(u32 user)
{
	return user == IP_DEFRAG_AF_PACKET ||
	       ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_IN,
					 __IP_DEFRAG_CONNTRACK_IN_END) ||
	       ip_defrag_user_in_between(user, IP_DEFRAG_CONNTRACK_BRIDGE_IN,
					 __IP_DEFRAG_CONNTRACK_BRIDGE_IN);
}

/*
 * Oops, a fragment queue timed out.  Kill it and send an ICMP reply.
 */
static void ip_expire(struct timer_list *t)
{
	struct inet_frag_queue *frag = from_timer(frag, t, timer);
	const struct iphdr *iph;
	struct sk_buff *head = NULL;
	struct net *net;
	struct ipq *qp;
	int err;

	qp = container_of(frag, struct ipq, q);
	net = container_of(qp->q.net, struct net, ipv4.frags);

	rcu_read_lock();
	spin_lock(&qp->q.lock);

	if (qp->q.flags & INET_FRAG_COMPLETE)
		goto out;

	ipq_kill(qp);
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
	__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);

	if (!(qp->q.flags & INET_FRAG_FIRST_IN))
		goto out;

	/* sk_buff::dev and sk_buff::rbnode are unionized. So we
	 * pull the head out of the tree in order to be able to
	 * deal with head->dev.
	 */
	head = inet_frag_pull_head(&qp->q);
	if (!head)
		goto out;
	head->dev = dev_get_by_index_rcu(net, qp->iif);
	if (!head->dev)
		goto out;

	/* skb has no dst, perform route lookup again */
	iph = ip_hdr(head);
	err = ip_route_input_noref(head, iph->daddr, iph->saddr,
				   iph->tos, head->dev);
	if (err)
		goto out;

	/* Only an end host needs to send an ICMP
	 * "Fragment Reassembly Timeout" message, per RFC792.
	 */
	if (frag_expire_skip_icmp(qp->q.key.v4.user) &&
	    (skb_rtable(head)->rt_type != RTN_LOCAL))
		goto out;

	spin_unlock(&qp->q.lock);
	icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
	goto out_rcu_unlock;

out:
	spin_unlock(&qp->q.lock);
out_rcu_unlock:
	rcu_read_unlock();
	if (head)
		kfree_skb(head);
	ipq_put(qp);
}

/* Find the correct entry in the "incomplete datagrams" queue for
 * this IP datagram, and create new one, if nothing is found.
 */
static struct ipq *ip_find(struct net *net, struct iphdr *iph,
			   u32 user, int vif)
{
	struct frag_v4_compare_key key = {
		.saddr = iph->saddr,
		.daddr = iph->daddr,
		.user = user,
		.vif = vif,
		.id = iph->id,
		.protocol = iph->protocol,
	};
	struct inet_frag_queue *q;

	q = inet_frag_find(&net->ipv4.frags, &key);
	if (!q)
		return NULL;

	return container_of(q, struct ipq, q);
}
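
/* The ipfrag_max_dist heuristic: peer->rid counts every fragment seen from
 * a given source address, and qp->rid remembers that count when this queue
 * last received one. If more than max_dist fragments from the same peer
 * have arrived in between, the queue is assumed stale and is reinitialized
 * (or killed) by the caller.
 */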
/* Is the fragment too far ahead to be part of ipq? */
static int ip_frag_too_far(struct ipq *qp)
{
	struct inet_peer *peer = qp->peer;
	unsigned int max = qp->q.net->max_dist;
	unsigned int start, end;
	int rc;

	if (!peer || !max)
		return 0;

	start = qp->rid;
	end = atomic_inc_return(&peer->rid);
	qp->rid = end;

	rc = qp->q.fragments_tail && (end - start) > max;

	if (rc) {
		struct net *net;

		net = container_of(qp->q.net, struct net, ipv4.frags);
		__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
	}

	return rc;
}

static int ip_frag_reinit(struct ipq *qp)
{
	unsigned int sum_truesize = 0;

	if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
		refcount_inc(&qp->q.refcnt);
		return -ETIMEDOUT;
	}

	sum_truesize = inet_frag_rbtree_purge(&qp->q.rb_fragments);
	sub_frag_mem_limit(qp->q.net, sum_truesize);

	qp->q.flags = 0;
	qp->q.len = 0;
	qp->q.meat = 0;
	qp->q.fragments = NULL;
	qp->q.rb_fragments = RB_ROOT;
	qp->q.fragments_tail = NULL;
	qp->q.last_run_head = NULL;
	qp->iif = 0;
	qp->ecn = 0;

	return 0;
}
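
/* Returns 0 when this fragment completed the datagram (skb then holds the
 * reassembled packet), -EINPROGRESS when the fragment was queued, and a
 * negative error code otherwise; in the error cases the skb is consumed.
 */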
/* Add new segment to existing queue. */
static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
	int ihl, end, flags, offset;
	struct sk_buff *prev_tail;
	struct net_device *dev;
	unsigned int fragsize;
	int err = -ENOENT;
	u8 ecn;

	if (qp->q.flags & INET_FRAG_COMPLETE)
		goto err;

	if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
	    unlikely(ip_frag_too_far(qp)) &&
	    unlikely(err = ip_frag_reinit(qp))) {
		ipq_kill(qp);
		goto err;
	}

	ecn = ip4_frag_ecn(ip_hdr(skb)->tos);
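	/* iph->frag_off packs three flag bits (reserved, IP_DF, IP_MF)
	 * above a 13-bit fragment offset counted in 8-byte units; split
	 * the flags off and convert the offset to bytes.
	 */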
	offset = ntohs(ip_hdr(skb)->frag_off);
	flags = offset & ~IP_OFFSET;
	offset &= IP_OFFSET;
	offset <<= 3;		/* offset is in 8-byte chunks */
	ihl = ip_hdrlen(skb);

	/* Determine the position of this fragment. */
	end = offset + skb->len - skb_network_offset(skb) - ihl;
	err = -EINVAL;

	/* Is this the final fragment? */
	if ((flags & IP_MF) == 0) {
		/* If we already have some bits beyond end
		 * or have different end, the segment is corrupted.
		 */
		if (end < qp->q.len ||
		    ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len))
			goto discard_qp;
		qp->q.flags |= INET_FRAG_LAST_IN;
		qp->q.len = end;
	} else {
		if (end & 7) {
			end &= ~7;
			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
				skb->ip_summed = CHECKSUM_NONE;
		}
		if (end > qp->q.len) {
			/* Some bits beyond end -> corruption. */
			if (qp->q.flags & INET_FRAG_LAST_IN)
				goto discard_qp;
			qp->q.len = end;
		}
	}
	if (end == offset)
		goto discard_qp;

	err = -ENOMEM;
	if (!pskb_pull(skb, skb_network_offset(skb) + ihl))
		goto discard_qp;

	err = pskb_trim_rcsum(skb, end - offset);
	if (err)
		goto discard_qp;

	/* Note : skb->rbnode and skb->dev share the same location. */
	dev = skb->dev;
	/* Makes sure compiler wont do silly aliasing games */
	barrier();

	prev_tail = qp->q.fragments_tail;
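	/* inet_frag_queue_insert() keeps fragments in an rbtree ordered by
	 * offset; it rejects exact duplicates (IPFRAG_DUP) and overlapping
	 * fragments, in which case the whole queue is discarded (see
	 * insert_error below).
	 */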
	err = inet_frag_queue_insert(&qp->q, skb, offset, end);
	if (err)
		goto insert_error;

	if (dev)
		qp->iif = dev->ifindex;

	qp->q.stamp = skb->tstamp;
	qp->q.meat += skb->len;
	qp->ecn |= ecn;
	add_frag_mem_limit(qp->q.net, skb->truesize);
	if (offset == 0)
		qp->q.flags |= INET_FRAG_FIRST_IN;

	fragsize = skb->len + ihl;

	if (fragsize > qp->q.max_size)
		qp->q.max_size = fragsize;

	if (ip_hdr(skb)->frag_off & htons(IP_DF) &&
	    fragsize > qp->max_df_size)
		qp->max_df_size = fragsize;

	if (qp->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
	    qp->q.meat == qp->q.len) {
		unsigned long orefdst = skb->_skb_refdst;

		skb->_skb_refdst = 0UL;
		err = ip_frag_reasm(qp, skb, prev_tail, dev);
		skb->_skb_refdst = orefdst;
		if (err)
			inet_frag_kill(&qp->q);
		return err;
	}

	skb_dst_drop(skb);
	return -EINPROGRESS;

insert_error:
	if (err == IPFRAG_DUP) {
		kfree_skb(skb);
		return -EINVAL;
	}
	err = -EINVAL;
	__IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
discard_qp:
	inet_frag_kill(&qp->q);
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
err:
	kfree_skb(skb);
	return err;
}

/* Build a new IP datagram from all its fragments. */
static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
			 struct sk_buff *prev_tail, struct net_device *dev)
{
	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
	struct iphdr *iph;
	void *reasm_data;
	int len, err;
	u8 ecn;

	ipq_kill(qp);

	ecn = ip_frag_ecn_table[qp->ecn];
	if (unlikely(ecn == 0xff)) {
		err = -EINVAL;
		goto out_fail;
	}

	/* Make the one we just received the head. */
	reasm_data = inet_frag_reasm_prepare(&qp->q, skb, prev_tail);
	if (!reasm_data)
		goto out_nomem;

	len = ip_hdrlen(skb) + qp->q.len;
	err = -E2BIG;
	if (len > 65535)
		goto out_oversize;

	inet_frag_reasm_finish(&qp->q, skb, reasm_data);

	skb->dev = dev;
	IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size);

	iph = ip_hdr(skb);
	iph->tot_len = htons(len);
	iph->tos |= ecn;

	/* When we set IP_DF on a refragmented skb we must also force a
	 * call to ip_fragment to avoid forwarding a DF-skb of size s while
	 * original sender only sent fragments of size f (where f < s).
	 *
	 * We only set DF/IPSKB_FRAG_PMTU if such DF fragment was the largest
	 * frag seen to avoid sending tiny DF-fragments in case skb was built
	 * from one very small df-fragment and one large non-df frag.
	 */
	if (qp->max_df_size == qp->q.max_size) {
		IPCB(skb)->flags |= IPSKB_FRAG_PMTU;
		iph->frag_off = htons(IP_DF);
	} else {
		iph->frag_off = 0;
	}

	ip_send_check(iph);

	__IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
	qp->q.fragments = NULL;
	qp->q.rb_fragments = RB_ROOT;
	qp->q.fragments_tail = NULL;
	qp->q.last_run_head = NULL;
	return 0;

out_nomem:
	net_dbg_ratelimited("queue_glue: no memory for gluing queue %p\n", qp);
	err = -ENOMEM;
	goto out_fail;
out_oversize:
	net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr);
out_fail:
	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
	return err;
}

/* Process an incoming IP datagram fragment. */
int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
	struct net_device *dev = skb->dev ? : skb_dst(skb)->dev;
	int vif = l3mdev_master_ifindex_rcu(dev);
	struct ipq *qp;

	__IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS);
	skb_orphan(skb);

	/* Lookup (or create) queue header */
	qp = ip_find(net, ip_hdr(skb), user, vif);
	if (qp) {
		int ret;

		spin_lock(&qp->q.lock);

		ret = ip_frag_queue(qp, skb);

		spin_unlock(&qp->q.lock);
		ipq_put(qp);
		return ret;
	}

	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
	kfree_skb(skb);
	return -ENOMEM;
}
EXPORT_SYMBOL(ip_defrag);
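
/* ip_check_defrag() is a convenience wrapper for callers outside the
 * normal input path: non-IPv4 buffers, packets with malformed headers and
 * non-fragments pass through untouched; fragments are handed to
 * ip_defrag(), and NULL is returned while reassembly is still in progress
 * or on error. When this fragment completes the datagram, the returned
 * skb is the fully reassembled packet.
 */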
struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user)
{
	struct iphdr iph;
	int netoff;
	u32 len;

	if (skb->protocol != htons(ETH_P_IP))
		return skb;

	netoff = skb_network_offset(skb);

	if (skb_copy_bits(skb, netoff, &iph, sizeof(iph)) < 0)
		return skb;

	if (iph.ihl < 5 || iph.version != 4)
		return skb;

	len = ntohs(iph.tot_len);
	if (skb->len < netoff + len || len < (iph.ihl * 4))
		return skb;

	if (ip_is_fragment(&iph)) {
		skb = skb_share_check(skb, GFP_ATOMIC);
		if (skb) {
			if (!pskb_may_pull(skb, netoff + iph.ihl * 4)) {
				kfree_skb(skb);
				return NULL;
			}
			if (pskb_trim_rcsum(skb, netoff + len)) {
				kfree_skb(skb);
				return NULL;
			}
			memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
			if (ip_defrag(net, skb, user))
				return NULL;
			skb_clear_hash(skb);
		}
	}
	return skb;
}
EXPORT_SYMBOL(ip_check_defrag);
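
/* A sketch (not part of this file) of how a tap-style caller such as
 * AF_PACKET's fanout path consumes the API above:
 *
 *	skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
 *	if (!skb)
 *		return;		// consumed: queued for reassembly or dropped
 *	// else skb is a non-fragment or a complete, reassembled datagram
 */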
#ifdef CONFIG_SYSCTL
static int dist_min;
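
/* The two threshold entries cross-reference each other through
 * extra1/extra2 so that proc_doulongvec_minmax() keeps
 * ipfrag_low_thresh <= ipfrag_high_thresh.
 */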
static struct ctl_table ip4_frags_ns_ctl_table[] = {
	{
		.procname	= "ipfrag_high_thresh",
		.data		= &init_net.ipv4.frags.high_thresh,
		.maxlen		= sizeof(unsigned long),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra1		= &init_net.ipv4.frags.low_thresh
	},
	{
		.procname	= "ipfrag_low_thresh",
		.data		= &init_net.ipv4.frags.low_thresh,
		.maxlen		= sizeof(unsigned long),
		.mode		= 0644,
		.proc_handler	= proc_doulongvec_minmax,
		.extra2		= &init_net.ipv4.frags.high_thresh
	},
	{
		.procname	= "ipfrag_time",
		.data		= &init_net.ipv4.frags.timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "ipfrag_max_dist",
		.data		= &init_net.ipv4.frags.max_dist,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &dist_min,
	},
	{ }
};

/* secret interval has been deprecated */
static int ip4_frags_secret_interval_unused;
static struct ctl_table ip4_frags_ctl_table[] = {
	{
		.procname	= "ipfrag_secret_interval",
		.data		= &ip4_frags_secret_interval_unused,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{ }
};

static int __net_init ip4_frags_ns_ctl_register(struct net *net)
{
	struct ctl_table *table;
	struct ctl_table_header *hdr;

	table = ip4_frags_ns_ctl_table;
	if (!net_eq(net, &init_net)) {
		table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL);
		if (!table)
			goto err_alloc;

		table[0].data = &net->ipv4.frags.high_thresh;
		table[0].extra1 = &net->ipv4.frags.low_thresh;
		table[0].extra2 = &init_net.ipv4.frags.high_thresh;
		table[1].data = &net->ipv4.frags.low_thresh;
		table[1].extra2 = &net->ipv4.frags.high_thresh;
		table[2].data = &net->ipv4.frags.timeout;
		table[3].data = &net->ipv4.frags.max_dist;
	}

	hdr = register_net_sysctl(net, "net/ipv4", table);
	if (!hdr)
		goto err_reg;

	net->ipv4.frags_hdr = hdr;
	return 0;

err_reg:
	if (!net_eq(net, &init_net))
		kfree(table);
err_alloc:
	return -ENOMEM;
}

static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net)
{
	struct ctl_table *table;

	table = net->ipv4.frags_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.frags_hdr);
	kfree(table);
}

static void __init ip4_frags_ctl_register(void)
{
	register_net_sysctl(&init_net, "net/ipv4", ip4_frags_ctl_table);
}
#else
static int ip4_frags_ns_ctl_register(struct net *net)
{
	return 0;
}

static void ip4_frags_ns_ctl_unregister(struct net *net)
{
}

static void __init ip4_frags_ctl_register(void)
{
}
#endif

static int __net_init ipv4_frags_init_net(struct net *net)
{
	int res;

	/* Fragment cache limits.
	 *
	 * The fragment memory accounting code, (tries to) account for
	 * the real memory usage, by measuring both the size of frag
	 * queue struct (inet_frag_queue (ipv4:ipq/ipv6:frag_queue))
	 * and the SKB's truesize.
	 *
	 * A 64K fragment consumes 129736 bytes (44*2944)+200
	 * (1500 truesize == 2944, sizeof(struct ipq) == 200)
	 *
	 * We will commit 4MB at one time. Should we cross that limit
	 * we will prune down to 3MB, making room for approx 8 big 64K
	 * fragments 8x128k.
	 */
	net->ipv4.frags.high_thresh = 4 * 1024 * 1024;
	net->ipv4.frags.low_thresh  = 3 * 1024 * 1024;
	/*
	 * Important NOTE! Fragment queue must be destroyed before MSL expires.
	 * RFC791 is wrong proposing to prolongate timer each fragment arrival
	 * by TTL.
	 */
	net->ipv4.frags.timeout = IP_FRAG_TIME;
	net->ipv4.frags.max_dist = 64;
	net->ipv4.frags.f = &ip4_frags;

	res = inet_frags_init_net(&net->ipv4.frags);
	if (res < 0)
		return res;
	res = ip4_frags_ns_ctl_register(net);
	if (res < 0)
		inet_frags_exit_net(&net->ipv4.frags);
	return res;
}

static void __net_exit ipv4_frags_exit_net(struct net *net)
{
	ip4_frags_ns_ctl_unregister(net);
	inet_frags_exit_net(&net->ipv4.frags);
}

static struct pernet_operations ip4_frags_ops = {
	.init = ipv4_frags_init_net,
	.exit = ipv4_frags_exit_net,
};

static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed)
{
	return jhash2(data,
		      sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
}

static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed)
{
	const struct inet_frag_queue *fq = data;

	return jhash2((const u32 *)&fq->key.v4,
		      sizeof(struct frag_v4_compare_key) / sizeof(u32), seed);
}

static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr)
{
	const struct frag_v4_compare_key *key = arg->key;
	const struct inet_frag_queue *fq = ptr;

	return !!memcmp(&fq->key, key, sizeof(*key));
}
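
/* rhashtable needs both hash functions to digest the same bytes:
 * ip4_key_hashfn() hashes a bare lookup key, ip4_obj_hashfn() hashes the
 * key embedded in a stored queue, and the two must agree so that lookups
 * and rehashing land in the same bucket.
 */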
static const struct rhashtable_params ip4_rhash_params = {
	.head_offset		= offsetof(struct inet_frag_queue, node),
	.key_offset		= offsetof(struct inet_frag_queue, key),
	.key_len		= sizeof(struct frag_v4_compare_key),
	.hashfn			= ip4_key_hashfn,
	.obj_hashfn		= ip4_obj_hashfn,
	.obj_cmpfn		= ip4_obj_cmpfn,
	.automatic_shrinking	= true,
};

void __init ipfrag_init(void)
{
	ip4_frags.constructor = ip4_frag_init;
	ip4_frags.destructor = ip4_frag_free;
	ip4_frags.qsize = sizeof(struct ipq);
	ip4_frags.frag_expire = ip_expire;
	ip4_frags.frags_cache_name = ip_frag_cache_name;
	ip4_frags.rhash_params = ip4_rhash_params;
	if (inet_frags_init(&ip4_frags))
		panic("IP: failed to allocate ip4_frags cache\n");
	ip4_frags_ctl_register();
	register_pernet_subsys(&ip4_frags_ops);
}