qgroup.h 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421
  1. /* SPDX-License-Identifier: GPL-2.0 */
  2. /*
  3. * Copyright (C) 2014 Facebook. All rights reserved.
  4. */
  5. #ifndef BTRFS_QGROUP_H
  6. #define BTRFS_QGROUP_H
  7. #include <linux/spinlock.h>
  8. #include <linux/rbtree.h>
  9. #include "ulist.h"
  10. #include "delayed-ref.h"
/*
 * Btrfs qgroup overview
 *
 * Btrfs qgroup splits into 3 main parts:
 * 1) Reserve
 *    Reserve metadata/data space for incoming operations.
 *    Affects how qgroup limit works.
 *
 * 2) Trace
 *    Tell btrfs qgroup to trace dirty extents.
 *
 *    Dirty extents include:
 *    - Newly allocated extents
 *    - Extents going to be deleted (in this trans)
 *    - Extents whose owner is going to be modified
 *
 *    This is the main part that affects whether qgroup numbers will stay
 *    consistent.
 *    Btrfs qgroup can trace clean extents and that won't cause any problem,
 *    but it will consume extra CPU time, so it should be avoided if possible.
 *
 * 3) Account
 *    Btrfs qgroup will update its numbers, based on the dirty extents traced
 *    in the previous step.
 *
 *    Normally at qgroup rescan and transaction commit time.
 */
/*
 * Special performance optimization for balance.
 *
 * For balance, we need to swap subtree of subvolume and reloc trees.
 * In theory, we need to trace all subtree blocks of both subvolume and reloc
 * trees, since their owner has changed during such swap.
 *
 * However since balance has ensured that both subtrees are containing the
 * same contents and have the same tree structures, such swap won't cause
 * qgroup number change.
 *
 * But there is a race window between subtree swap and transaction commit,
 * during that window, if we increase/decrease tree level or merge/split tree
 * blocks, we still need to trace the original subtrees.
 *
 * So for balance, we use a delayed subtree tracing, whose workflow is:
 *
 * 1) Record the subtree root blocks that get swapped.
 *
 *    During subtree swap:
 *    O = Old tree blocks
 *    N = New tree blocks
 *          reloc tree                     subvolume tree X
 *             Root                              Root
 *            /    \                            /    \
 *          NA     OB                         OA      OB
 *        /  |     |  \                     /  |      |  \
 *      NC  ND     OE  OF                  OC  OD     OE  OF
 *
 *    In this case, NA and OA are going to be swapped, record (NA, OA) into
 *    subvolume tree X.
 *
 * 2) After subtree swap.
 *          reloc tree                     subvolume tree X
 *             Root                              Root
 *            /    \                            /    \
 *          OA     OB                         NA      OB
 *        /  |     |  \                     /  |      |  \
 *      OC  OD     OE  OF                  NC  ND     OE  OF
 *
 * 3a) COW happens for OB
 *     If we are going to COW tree block OB, we check OB's bytenr against
 *     tree X's swapped_blocks structure.
 *     If it doesn't fit any, nothing will happen.
 *
 * 3b) COW happens for NA
 *     Check NA's bytenr against tree X's swapped_blocks, and get a hit.
 *     Then we do subtree scan on both subtrees OA and NA.
 *     Resulting 6 tree blocks to be scanned (OA, OC, OD, NA, NC, ND).
 *
 *     Then no matter what we do to subvolume tree X, qgroup numbers will
 *     still be correct.
 *     Then NA's record gets removed from X's swapped_blocks.
 *
 * 4) Transaction commit
 *    Any record in X's swapped_blocks gets removed, since there is no
 *    modification to the swapped subtrees, no need to trigger heavy qgroup
 *    subtree rescan for them.
 */
  97. /*
  98. * Record a dirty extent, and info qgroup to update quota on it
  99. * TODO: Use kmem cache to alloc it.
  100. */
  101. struct btrfs_qgroup_extent_record {
  102. struct rb_node node;
  103. u64 bytenr;
  104. u64 num_bytes;
  105. /*
  106. * For qgroup reserved data space freeing.
  107. *
  108. * @data_rsv_refroot and @data_rsv will be recorded after
  109. * BTRFS_ADD_DELAYED_EXTENT is called.
  110. * And will be used to free reserved qgroup space at
  111. * transaction commit time.
  112. */
  113. u32 data_rsv; /* reserved data space needs to be freed */
  114. u64 data_rsv_refroot; /* which root the reserved data belongs to */
  115. struct ulist *old_roots;
  116. };
  117. struct btrfs_qgroup_swapped_block {
  118. struct rb_node node;
  119. int level;
  120. bool trace_leaf;
  121. /* bytenr/generation of the tree block in subvolume tree after swap */
  122. u64 subvol_bytenr;
  123. u64 subvol_generation;
  124. /* bytenr/generation of the tree block in reloc tree after swap */
  125. u64 reloc_bytenr;
  126. u64 reloc_generation;
  127. u64 last_snapshot;
  128. struct btrfs_key first_key;
  129. };
  130. /*
  131. * Qgroup reservation types:
  132. *
  133. * DATA:
  134. * space reserved for data
  135. *
  136. * META_PERTRANS:
  137. * Space reserved for metadata (per-transaction)
  138. * Due to the fact that qgroup data is only updated at transaction commit
  139. * time, reserved space for metadata must be kept until transaction
  140. * commits.
  141. * Any metadata reserved that are used in btrfs_start_transaction() should
  142. * be of this type.
  143. *
  144. * META_PREALLOC:
  145. * There are cases where metadata space is reserved before starting
  146. * transaction, and then btrfs_join_transaction() to get a trans handle.
  147. * Any metadata reserved for such usage should be of this type.
  148. * And after join_transaction() part (or all) of such reservation should
  149. * be converted into META_PERTRANS.
  150. */
  151. enum btrfs_qgroup_rsv_type {
  152. BTRFS_QGROUP_RSV_DATA,
  153. BTRFS_QGROUP_RSV_META_PERTRANS,
  154. BTRFS_QGROUP_RSV_META_PREALLOC,
  155. BTRFS_QGROUP_RSV_LAST,
  156. };
  157. /*
  158. * Represents how many bytes we have reserved for this qgroup.
  159. *
  160. * Each type should have different reservation behavior.
  161. * E.g, data follows its io_tree flag modification, while
  162. * *currently* meta is just reserve-and-clear during transaction.
  163. *
  164. * TODO: Add new type for reservation which can survive transaction commit.
  165. * Current metadata reservation behavior is not suitable for such case.
  166. */
  167. struct btrfs_qgroup_rsv {
  168. u64 values[BTRFS_QGROUP_RSV_LAST];
  169. };
  170. /*
  171. * one struct for each qgroup, organized in fs_info->qgroup_tree.
  172. */
  173. struct btrfs_qgroup {
  174. u64 qgroupid;
  175. /*
  176. * state
  177. */
  178. u64 rfer; /* referenced */
  179. u64 rfer_cmpr; /* referenced compressed */
  180. u64 excl; /* exclusive */
  181. u64 excl_cmpr; /* exclusive compressed */
  182. /*
  183. * limits
  184. */
  185. u64 lim_flags; /* which limits are set */
  186. u64 max_rfer;
  187. u64 max_excl;
  188. u64 rsv_rfer;
  189. u64 rsv_excl;
  190. /*
  191. * reservation tracking
  192. */
  193. struct btrfs_qgroup_rsv rsv;
  194. /*
  195. * lists
  196. */
  197. struct list_head groups; /* groups this group is member of */
  198. struct list_head members; /* groups that are members of this group */
  199. struct list_head dirty; /* dirty groups */
  200. struct rb_node node; /* tree of qgroups */
  201. /*
  202. * temp variables for accounting operations
  203. * Refer to qgroup_shared_accounting() for details.
  204. */
  205. u64 old_refcnt;
  206. u64 new_refcnt;
  207. };
  208. /*
  209. * For qgroup event trace points only
  210. */
  211. #define QGROUP_RESERVE (1<<0)
  212. #define QGROUP_RELEASE (1<<1)
  213. #define QGROUP_FREE (1<<2)
  214. int btrfs_quota_enable(struct btrfs_fs_info *fs_info);
  215. int btrfs_quota_disable(struct btrfs_fs_info *fs_info);
  216. int btrfs_qgroup_rescan(struct btrfs_fs_info *fs_info);
  217. void btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info);
  218. int btrfs_qgroup_wait_for_completion(struct btrfs_fs_info *fs_info,
  219. bool interruptible);
  220. int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
  221. u64 dst);
  222. int btrfs_del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src,
  223. u64 dst);
  224. int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
  225. int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid);
  226. int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid,
  227. struct btrfs_qgroup_limit *limit);
  228. int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info);
  229. void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info);
  230. struct btrfs_delayed_extent_op;
  231. /*
  232. * Inform qgroup to trace one dirty extent, its info is recorded in @record.
  233. * So qgroup can account it at transaction committing time.
  234. *
  235. * No lock version, caller must acquire delayed ref lock and allocated memory,
  236. * then call btrfs_qgroup_trace_extent_post() after exiting lock context.
  237. *
  238. * Return 0 for success insert
  239. * Return >0 for existing record, caller can free @record safely.
  240. * Error is not possible
  241. */
  242. int btrfs_qgroup_trace_extent_nolock(
  243. struct btrfs_fs_info *fs_info,
  244. struct btrfs_delayed_ref_root *delayed_refs,
  245. struct btrfs_qgroup_extent_record *record);
  246. /*
  247. * Post handler after qgroup_trace_extent_nolock().
  248. *
  249. * NOTE: Current qgroup does the expensive backref walk at transaction
  250. * committing time with TRANS_STATE_COMMIT_DOING, this blocks incoming
  251. * new transaction.
  252. * This is designed to allow btrfs_find_all_roots() to get correct new_roots
  253. * result.
  254. *
  255. * However for old_roots there is no need to do backref walk at that time,
  256. * since we search commit roots to walk backref and result will always be
  257. * correct.
  258. *
  259. * Due to the nature of no lock version, we can't do backref there.
  260. * So we must call btrfs_qgroup_trace_extent_post() after exiting
  261. * spinlock context.
  262. *
  263. * TODO: If we can fix and prove btrfs_find_all_roots() can get correct result
  264. * using current root, then we can move all expensive backref walk out of
  265. * transaction committing, but not now as qgroup accounting will be wrong again.
  266. */
  267. int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
  268. struct btrfs_qgroup_extent_record *qrecord);
  269. /*
  270. * Inform qgroup to trace one dirty extent, specified by @bytenr and
  271. * @num_bytes.
  272. * So qgroup can account it at commit trans time.
  273. *
  274. * Better encapsulated version, with memory allocation and backref walk for
  275. * commit roots.
  276. * So this can sleep.
  277. *
  278. * Return 0 if the operation is done.
  279. * Return <0 for error, like memory allocation failure or invalid parameter
  280. * (NULL trans)
  281. */
  282. int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
  283. u64 num_bytes, gfp_t gfp_flag);
  284. /*
  285. * Inform qgroup to trace all leaf items of data
  286. *
  287. * Return 0 for success
  288. * Return <0 for error(ENOMEM)
  289. */
  290. int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
  291. struct extent_buffer *eb);
  292. /*
  293. * Inform qgroup to trace a whole subtree, including all its child tree
  294. * blocks and data.
  295. * The root tree block is specified by @root_eb.
  296. *
  297. * Normally used by relocation(tree block swap) and subvolume deletion.
  298. *
  299. * Return 0 for success
  300. * Return <0 for error(ENOMEM or tree search error)
  301. */
  302. int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
  303. struct extent_buffer *root_eb,
  304. u64 root_gen, int root_level);
  305. int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
  306. u64 num_bytes, struct ulist *old_roots,
  307. struct ulist *new_roots);
  308. int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans);
  309. int btrfs_run_qgroups(struct btrfs_trans_handle *trans);
  310. int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
  311. u64 objectid, struct btrfs_qgroup_inherit *inherit);
  312. void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
  313. u64 ref_root, u64 num_bytes,
  314. enum btrfs_qgroup_rsv_type type);
  315. #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
  316. int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
  317. u64 rfer, u64 excl);
  318. #endif
  319. /* New io_tree based accurate qgroup reserve API */
  320. int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
  321. struct extent_changeset **reserved, u64 start, u64 len);
  322. int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
  323. int btrfs_qgroup_free_data(struct inode *inode,
  324. struct extent_changeset *reserved, u64 start, u64 len);
  325. int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
  326. enum btrfs_qgroup_rsv_type type, bool enforce);
  327. int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
  328. enum btrfs_qgroup_rsv_type type, bool enforce);
  329. /* Reserve metadata space for pertrans and prealloc type */
  330. static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root,
  331. int num_bytes, bool enforce)
  332. {
  333. return __btrfs_qgroup_reserve_meta(root, num_bytes,
  334. BTRFS_QGROUP_RSV_META_PERTRANS, enforce);
  335. }
  336. static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root,
  337. int num_bytes, bool enforce)
  338. {
  339. return __btrfs_qgroup_reserve_meta(root, num_bytes,
  340. BTRFS_QGROUP_RSV_META_PREALLOC, enforce);
  341. }
  342. void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
  343. enum btrfs_qgroup_rsv_type type);
  344. /* Free per-transaction meta reservation for error handling */
  345. static inline void btrfs_qgroup_free_meta_pertrans(struct btrfs_root *root,
  346. int num_bytes)
  347. {
  348. __btrfs_qgroup_free_meta(root, num_bytes,
  349. BTRFS_QGROUP_RSV_META_PERTRANS);
  350. }
  351. /* Pre-allocated meta reservation can be freed at need */
  352. static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root,
  353. int num_bytes)
  354. {
  355. __btrfs_qgroup_free_meta(root, num_bytes,
  356. BTRFS_QGROUP_RSV_META_PREALLOC);
  357. }
  358. /*
  359. * Per-transaction meta reservation should be all freed at transaction commit
  360. * time
  361. */
  362. void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);
  363. /*
  364. * Convert @num_bytes of META_PREALLOCATED reservation to META_PERTRANS.
  365. *
  366. * This is called when preallocated meta reservation needs to be used.
  367. * Normally after btrfs_join_transaction() call.
  368. */
  369. void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
  370. void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode);
  371. /* btrfs_qgroup_swapped_blocks related functions */
  372. void btrfs_qgroup_init_swapped_blocks(
  373. struct btrfs_qgroup_swapped_blocks *swapped_blocks);
  374. void btrfs_qgroup_clean_swapped_blocks(struct btrfs_root *root);
  375. int btrfs_qgroup_add_swapped_blocks(struct btrfs_trans_handle *trans,
  376. struct btrfs_root *subvol_root,
  377. struct btrfs_block_group_cache *bg,
  378. struct extent_buffer *subvol_parent, int subvol_slot,
  379. struct extent_buffer *reloc_parent, int reloc_slot,
  380. u64 last_snapshot);
  381. int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
  382. struct btrfs_root *root, struct extent_buffer *eb);
  383. void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans);
  384. #endif