bloom.h 3.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113
  1. #ifndef BLOOM_H
  2. #define BLOOM_H
  3. struct commit;
  4. struct repository;
  5. struct bloom_filter_settings {
  6. /*
  7. * The version of the hashing technique being used.
  8. * We currently only support version = 1 which is
  9. * the seeded murmur3 hashing technique implemented
  10. * in bloom.c.
  11. */
  12. uint32_t hash_version;
  13. /*
  14. * The number of times a path is hashed, i.e. the
  15. * number of bit positions tht cumulatively
  16. * determine whether a path is present in the
  17. * Bloom filter.
  18. */
  19. uint32_t num_hashes;
  20. /*
  21. * The minimum number of bits per entry in the Bloom
  22. * filter. If the filter contains 'n' entries, then
  23. * filter size is the minimum number of 8-bit words
  24. * that contain n*b bits.
  25. */
  26. uint32_t bits_per_entry;
  27. /*
  28. * The maximum number of changed paths per commit
  29. * before declaring a Bloom filter to be too-large.
  30. *
  31. * Not written to the commit-graph file.
  32. */
  33. uint32_t max_changed_paths;
  34. };
  35. #define DEFAULT_BLOOM_MAX_CHANGES 512
  36. #define DEFAULT_BLOOM_FILTER_SETTINGS { 1, 7, 10, DEFAULT_BLOOM_MAX_CHANGES }
  37. #define BITS_PER_WORD 8
  38. #define BLOOMDATA_CHUNK_HEADER_SIZE 3 * sizeof(uint32_t)
  39. /*
  40. * A bloom_filter struct represents a data segment to
  41. * use when testing hash values. The 'len' member
  42. * dictates how many entries are stored in
  43. * 'data'.
  44. */
  45. struct bloom_filter {
  46. unsigned char *data;
  47. size_t len;
  48. };
  49. /*
  50. * A bloom_key represents the k hash values for a
  51. * given string. These can be precomputed and
  52. * stored in a bloom_key for re-use when testing
  53. * against a bloom_filter. The number of hashes is
  54. * given by the Bloom filter settings and is the same
  55. * for all Bloom filters and keys interacting with
  56. * the loaded version of the commit graph file and
  57. * the Bloom data chunks.
  58. */
  59. struct bloom_key {
  60. uint32_t *hashes;
  61. };
  62. /*
  63. * Calculate the murmur3 32-bit hash value for the given data
  64. * using the given seed.
  65. * Produces a uniformly distributed hash value.
  66. * Not considered to be cryptographically secure.
  67. * Implemented as described in https://en.wikipedia.org/wiki/MurmurHash#Algorithm
  68. */
  69. uint32_t murmur3_seeded(uint32_t seed, const char *data, size_t len);
  70. void fill_bloom_key(const char *data,
  71. size_t len,
  72. struct bloom_key *key,
  73. const struct bloom_filter_settings *settings);
  74. void clear_bloom_key(struct bloom_key *key);
  75. void add_key_to_filter(const struct bloom_key *key,
  76. struct bloom_filter *filter,
  77. const struct bloom_filter_settings *settings);
  78. void init_bloom_filters(void);
  79. enum bloom_filter_computed {
  80. BLOOM_NOT_COMPUTED = (1 << 0),
  81. BLOOM_COMPUTED = (1 << 1),
  82. BLOOM_TRUNC_LARGE = (1 << 2),
  83. BLOOM_TRUNC_EMPTY = (1 << 3),
  84. };
  85. struct bloom_filter *get_or_compute_bloom_filter(struct repository *r,
  86. struct commit *c,
  87. int compute_if_not_present,
  88. const struct bloom_filter_settings *settings,
  89. enum bloom_filter_computed *computed);
  90. #define get_bloom_filter(r, c) get_or_compute_bloom_filter( \
  91. (r), (c), 0, NULL, NULL)
  92. int bloom_filter_contains(const struct bloom_filter *filter,
  93. const struct bloom_key *key,
  94. const struct bloom_filter_settings *settings);
  95. #endif