mbcache.c 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437
  1. #include <linux/spinlock.h>
  2. #include <linux/slab.h>
  3. #include <linux/list.h>
  4. #include <linux/list_bl.h>
  5. #include <linux/module.h>
  6. #include <linux/sched.h>
  7. #include <linux/workqueue.h>
  8. #include <linux/mbcache.h>
  9. /*
  10. * Mbcache is a simple key-value store. Keys need not be unique, however
  11. * key-value pairs are expected to be unique (we use this fact in
  12. * mb_cache_entry_delete()).
  13. *
  14. * Ext2 and ext4 use this cache for deduplication of extended attribute blocks.
  15. * Ext4 also uses it for deduplication of xattr values stored in inodes.
  16. * They use hash of data as a key and provide a value that may represent a
  17. * block or inode number. That's why keys need not be unique (hash of different
  18. * data may be the same). However user provided value always uniquely
  19. * identifies a cache entry.
  20. *
  21. * We provide functions for creation and removal of entries, search by key,
  22. * and a special "delete entry with given key-value pair" operation. Fixed
  23. * size hash table is used for fast key lookups.
  24. */
  25. struct mb_cache {
  26. /* Hash table of entries */
  27. struct hlist_bl_head *c_hash;
  28. /* log2 of hash table size */
  29. int c_bucket_bits;
  30. /* Maximum entries in cache to avoid degrading hash too much */
  31. unsigned long c_max_entries;
  32. /* Protects c_list, c_entry_count */
  33. spinlock_t c_list_lock;
  34. struct list_head c_list;
  35. /* Number of entries in cache */
  36. unsigned long c_entry_count;
  37. struct shrinker c_shrink;
  38. /* Work for shrinking when the cache has too many entries */
  39. struct work_struct c_shrink_work;
  40. };
  41. static struct kmem_cache *mb_entry_cache;
  42. static unsigned long mb_cache_shrink(struct mb_cache *cache,
  43. unsigned long nr_to_scan);
  44. static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache,
  45. u32 key)
  46. {
  47. return &cache->c_hash[hash_32(key, cache->c_bucket_bits)];
  48. }
  49. /*
  50. * Number of entries to reclaim synchronously when there are too many entries
  51. * in cache
  52. */
  53. #define SYNC_SHRINK_BATCH 64
  54. /*
  55. * mb_cache_entry_create - create entry in cache
  56. * @cache - cache where the entry should be created
  57. * @mask - gfp mask with which the entry should be allocated
  58. * @key - key of the entry
  59. * @value - value of the entry
  60. * @reusable - is the entry reusable by others?
  61. *
  62. * Creates entry in @cache with key @key and value @value. The function returns
  63. * -EBUSY if entry with the same key and value already exists in cache.
  64. * Otherwise 0 is returned.
  65. */
  66. int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key,
  67. u64 value, bool reusable)
  68. {
  69. struct mb_cache_entry *entry, *dup;
  70. struct hlist_bl_node *dup_node;
  71. struct hlist_bl_head *head;
  72. /* Schedule background reclaim if there are too many entries */
  73. if (cache->c_entry_count >= cache->c_max_entries)
  74. schedule_work(&cache->c_shrink_work);
  75. /* Do some sync reclaim if background reclaim cannot keep up */
  76. if (cache->c_entry_count >= 2*cache->c_max_entries)
  77. mb_cache_shrink(cache, SYNC_SHRINK_BATCH);
  78. entry = kmem_cache_alloc(mb_entry_cache, mask);
  79. if (!entry)
  80. return -ENOMEM;
  81. INIT_LIST_HEAD(&entry->e_list);
  82. /* One ref for hash, one ref returned */
  83. atomic_set(&entry->e_refcnt, 1);
  84. entry->e_key = key;
  85. entry->e_value = value;
  86. entry->e_reusable = reusable;
  87. entry->e_referenced = 0;
  88. head = mb_cache_entry_head(cache, key);
  89. hlist_bl_lock(head);
  90. hlist_bl_for_each_entry(dup, dup_node, head, e_hash_list) {
  91. if (dup->e_key == key && dup->e_value == value) {
  92. hlist_bl_unlock(head);
  93. kmem_cache_free(mb_entry_cache, entry);
  94. return -EBUSY;
  95. }
  96. }
  97. hlist_bl_add_head(&entry->e_hash_list, head);
  98. hlist_bl_unlock(head);
  99. spin_lock(&cache->c_list_lock);
  100. list_add_tail(&entry->e_list, &cache->c_list);
  101. /* Grab ref for LRU list */
  102. atomic_inc(&entry->e_refcnt);
  103. cache->c_entry_count++;
  104. spin_unlock(&cache->c_list_lock);
  105. return 0;
  106. }
  107. EXPORT_SYMBOL(mb_cache_entry_create);
  108. void __mb_cache_entry_free(struct mb_cache_entry *entry)
  109. {
  110. kmem_cache_free(mb_entry_cache, entry);
  111. }
  112. EXPORT_SYMBOL(__mb_cache_entry_free);
  113. static struct mb_cache_entry *__entry_find(struct mb_cache *cache,
  114. struct mb_cache_entry *entry,
  115. u32 key)
  116. {
  117. struct mb_cache_entry *old_entry = entry;
  118. struct hlist_bl_node *node;
  119. struct hlist_bl_head *head;
  120. head = mb_cache_entry_head(cache, key);
  121. hlist_bl_lock(head);
  122. if (entry && !hlist_bl_unhashed(&entry->e_hash_list))
  123. node = entry->e_hash_list.next;
  124. else
  125. node = hlist_bl_first(head);
  126. while (node) {
  127. entry = hlist_bl_entry(node, struct mb_cache_entry,
  128. e_hash_list);
  129. if (entry->e_key == key && entry->e_reusable) {
  130. atomic_inc(&entry->e_refcnt);
  131. goto out;
  132. }
  133. node = node->next;
  134. }
  135. entry = NULL;
  136. out:
  137. hlist_bl_unlock(head);
  138. if (old_entry)
  139. mb_cache_entry_put(cache, old_entry);
  140. return entry;
  141. }
  142. /*
  143. * mb_cache_entry_find_first - find the first reusable entry with the given key
  144. * @cache: cache where we should search
  145. * @key: key to look for
  146. *
  147. * Search in @cache for a reusable entry with key @key. Grabs reference to the
  148. * first reusable entry found and returns the entry.
  149. */
  150. struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache,
  151. u32 key)
  152. {
  153. return __entry_find(cache, NULL, key);
  154. }
  155. EXPORT_SYMBOL(mb_cache_entry_find_first);
  156. /*
  157. * mb_cache_entry_find_next - find next reusable entry with the same key
  158. * @cache: cache where we should search
  159. * @entry: entry to start search from
  160. *
  161. * Finds next reusable entry in the hash chain which has the same key as @entry.
  162. * If @entry is unhashed (which can happen when deletion of entry races with the
  163. * search), finds the first reusable entry in the hash chain. The function drops
  164. * reference to @entry and returns with a reference to the found entry.
  165. */
  166. struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache,
  167. struct mb_cache_entry *entry)
  168. {
  169. return __entry_find(cache, entry, entry->e_key);
  170. }
  171. EXPORT_SYMBOL(mb_cache_entry_find_next);
  172. /*
  173. * mb_cache_entry_get - get a cache entry by value (and key)
  174. * @cache - cache we work with
  175. * @key - key
  176. * @value - value
  177. */
  178. struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key,
  179. u64 value)
  180. {
  181. struct hlist_bl_node *node;
  182. struct hlist_bl_head *head;
  183. struct mb_cache_entry *entry;
  184. head = mb_cache_entry_head(cache, key);
  185. hlist_bl_lock(head);
  186. hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
  187. if (entry->e_key == key && entry->e_value == value) {
  188. atomic_inc(&entry->e_refcnt);
  189. goto out;
  190. }
  191. }
  192. entry = NULL;
  193. out:
  194. hlist_bl_unlock(head);
  195. return entry;
  196. }
  197. EXPORT_SYMBOL(mb_cache_entry_get);
  198. /* mb_cache_entry_delete - remove a cache entry
  199. * @cache - cache we work with
  200. * @key - key
  201. * @value - value
  202. *
  203. * Remove entry from cache @cache with key @key and value @value.
  204. */
  205. void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value)
  206. {
  207. struct hlist_bl_node *node;
  208. struct hlist_bl_head *head;
  209. struct mb_cache_entry *entry;
  210. head = mb_cache_entry_head(cache, key);
  211. hlist_bl_lock(head);
  212. hlist_bl_for_each_entry(entry, node, head, e_hash_list) {
  213. if (entry->e_key == key && entry->e_value == value) {
  214. /* We keep hash list reference to keep entry alive */
  215. hlist_bl_del_init(&entry->e_hash_list);
  216. hlist_bl_unlock(head);
  217. spin_lock(&cache->c_list_lock);
  218. if (!list_empty(&entry->e_list)) {
  219. list_del_init(&entry->e_list);
  220. if (!WARN_ONCE(cache->c_entry_count == 0,
  221. "mbcache: attempt to decrement c_entry_count past zero"))
  222. cache->c_entry_count--;
  223. atomic_dec(&entry->e_refcnt);
  224. }
  225. spin_unlock(&cache->c_list_lock);
  226. mb_cache_entry_put(cache, entry);
  227. return;
  228. }
  229. }
  230. hlist_bl_unlock(head);
  231. }
  232. EXPORT_SYMBOL(mb_cache_entry_delete);
  233. /* mb_cache_entry_touch - cache entry got used
  234. * @cache - cache the entry belongs to
  235. * @entry - entry that got used
  236. *
  237. * Marks entry as used to give hit higher chances of surviving in cache.
  238. */
  239. void mb_cache_entry_touch(struct mb_cache *cache,
  240. struct mb_cache_entry *entry)
  241. {
  242. entry->e_referenced = 1;
  243. }
  244. EXPORT_SYMBOL(mb_cache_entry_touch);
  245. static unsigned long mb_cache_count(struct shrinker *shrink,
  246. struct shrink_control *sc)
  247. {
  248. struct mb_cache *cache = container_of(shrink, struct mb_cache,
  249. c_shrink);
  250. return cache->c_entry_count;
  251. }
  252. /* Shrink number of entries in cache */
  253. static unsigned long mb_cache_shrink(struct mb_cache *cache,
  254. unsigned long nr_to_scan)
  255. {
  256. struct mb_cache_entry *entry;
  257. struct hlist_bl_head *head;
  258. unsigned long shrunk = 0;
  259. spin_lock(&cache->c_list_lock);
  260. while (nr_to_scan-- && !list_empty(&cache->c_list)) {
  261. entry = list_first_entry(&cache->c_list,
  262. struct mb_cache_entry, e_list);
  263. if (entry->e_referenced) {
  264. entry->e_referenced = 0;
  265. list_move_tail(&entry->e_list, &cache->c_list);
  266. continue;
  267. }
  268. list_del_init(&entry->e_list);
  269. cache->c_entry_count--;
  270. /*
  271. * We keep LRU list reference so that entry doesn't go away
  272. * from under us.
  273. */
  274. spin_unlock(&cache->c_list_lock);
  275. head = mb_cache_entry_head(cache, entry->e_key);
  276. hlist_bl_lock(head);
  277. if (!hlist_bl_unhashed(&entry->e_hash_list)) {
  278. hlist_bl_del_init(&entry->e_hash_list);
  279. atomic_dec(&entry->e_refcnt);
  280. }
  281. hlist_bl_unlock(head);
  282. if (mb_cache_entry_put(cache, entry))
  283. shrunk++;
  284. cond_resched();
  285. spin_lock(&cache->c_list_lock);
  286. }
  287. spin_unlock(&cache->c_list_lock);
  288. return shrunk;
  289. }
  290. static unsigned long mb_cache_scan(struct shrinker *shrink,
  291. struct shrink_control *sc)
  292. {
  293. struct mb_cache *cache = container_of(shrink, struct mb_cache,
  294. c_shrink);
  295. return mb_cache_shrink(cache, sc->nr_to_scan);
  296. }
  297. /* We shrink 1/X of the cache when we have too many entries in it */
  298. #define SHRINK_DIVISOR 16
  299. static void mb_cache_shrink_worker(struct work_struct *work)
  300. {
  301. struct mb_cache *cache = container_of(work, struct mb_cache,
  302. c_shrink_work);
  303. mb_cache_shrink(cache, cache->c_max_entries / SHRINK_DIVISOR);
  304. }
  305. /*
  306. * mb_cache_create - create cache
  307. * @bucket_bits: log2 of the hash table size
  308. *
  309. * Create cache for keys with 2^bucket_bits hash entries.
  310. */
  311. struct mb_cache *mb_cache_create(int bucket_bits)
  312. {
  313. struct mb_cache *cache;
  314. unsigned long bucket_count = 1UL << bucket_bits;
  315. unsigned long i;
  316. cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL);
  317. if (!cache)
  318. goto err_out;
  319. cache->c_bucket_bits = bucket_bits;
  320. cache->c_max_entries = bucket_count << 4;
  321. INIT_LIST_HEAD(&cache->c_list);
  322. spin_lock_init(&cache->c_list_lock);
  323. cache->c_hash = kmalloc_array(bucket_count,
  324. sizeof(struct hlist_bl_head),
  325. GFP_KERNEL);
  326. if (!cache->c_hash) {
  327. kfree(cache);
  328. goto err_out;
  329. }
  330. for (i = 0; i < bucket_count; i++)
  331. INIT_HLIST_BL_HEAD(&cache->c_hash[i]);
  332. cache->c_shrink.count_objects = mb_cache_count;
  333. cache->c_shrink.scan_objects = mb_cache_scan;
  334. cache->c_shrink.seeks = DEFAULT_SEEKS;
  335. if (register_shrinker(&cache->c_shrink)) {
  336. kfree(cache->c_hash);
  337. kfree(cache);
  338. goto err_out;
  339. }
  340. INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker);
  341. return cache;
  342. err_out:
  343. return NULL;
  344. }
  345. EXPORT_SYMBOL(mb_cache_create);
  346. /*
  347. * mb_cache_destroy - destroy cache
  348. * @cache: the cache to destroy
  349. *
  350. * Free all entries in cache and cache itself. Caller must make sure nobody
  351. * (except shrinker) can reach @cache when calling this.
  352. */
  353. void mb_cache_destroy(struct mb_cache *cache)
  354. {
  355. struct mb_cache_entry *entry, *next;
  356. unregister_shrinker(&cache->c_shrink);
  357. /*
  358. * We don't bother with any locking. Cache must not be used at this
  359. * point.
  360. */
  361. list_for_each_entry_safe(entry, next, &cache->c_list, e_list) {
  362. if (!hlist_bl_unhashed(&entry->e_hash_list)) {
  363. hlist_bl_del_init(&entry->e_hash_list);
  364. atomic_dec(&entry->e_refcnt);
  365. } else
  366. WARN_ON(1);
  367. list_del(&entry->e_list);
  368. WARN_ON(atomic_read(&entry->e_refcnt) != 1);
  369. mb_cache_entry_put(cache, entry);
  370. }
  371. kfree(cache->c_hash);
  372. kfree(cache);
  373. }
  374. EXPORT_SYMBOL(mb_cache_destroy);
  375. static int __init mbcache_init(void)
  376. {
  377. mb_entry_cache = kmem_cache_create("mbcache",
  378. sizeof(struct mb_cache_entry), 0,
  379. SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL);
  380. if (!mb_entry_cache)
  381. return -ENOMEM;
  382. return 0;
  383. }
  384. static void __exit mbcache_exit(void)
  385. {
  386. kmem_cache_destroy(mb_entry_cache);
  387. }
  388. module_init(mbcache_init)
  389. module_exit(mbcache_exit)
  390. MODULE_AUTHOR("Jan Kara <jack@suse.cz>");
  391. MODULE_DESCRIPTION("Meta block cache (for extended attributes)");
  392. MODULE_LICENSE("GPL");