weakhash.go 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
  1. // Copyright (C) 2016 The Syncthing Authors.
  2. //
  3. // This Source Code Form is subject to the terms of the Mozilla Public
  4. // License, v. 2.0. If a copy of the MPL was not distributed with this file,
  5. // You can obtain one at https://mozilla.org/MPL/2.0/.
  6. package weakhash
  7. import (
  8. "bufio"
  9. "context"
  10. "io"
  11. "github.com/chmduquesne/rollinghash/adler32"
  12. )
  13. const (
  14. Size = 4
  15. // don't track more hits than this for any given weakhash
  16. maxWeakhashFinderHits = 10
  17. )
  18. // Find finds all the blocks of the given size within io.Reader that matches
  19. // the hashes provided, and returns a hash -> slice of offsets within reader
  20. // map, that produces the same weak hash.
  21. func Find(ctx context.Context, ir io.Reader, hashesToFind []uint32, size int) (map[uint32][]int64, error) {
  22. if ir == nil || len(hashesToFind) == 0 {
  23. return nil, nil
  24. }
  25. r := bufio.NewReader(ir)
  26. hf := adler32.New()
  27. n, err := io.CopyN(hf, r, int64(size))
  28. if err == io.EOF {
  29. return nil, nil
  30. }
  31. if err != nil {
  32. return nil, err
  33. }
  34. if n != int64(size) {
  35. return nil, io.ErrShortBuffer
  36. }
  37. offsets := make(map[uint32][]int64)
  38. for _, hashToFind := range hashesToFind {
  39. offsets[hashToFind] = make([]int64, 0, maxWeakhashFinderHits)
  40. }
  41. var i int64
  42. var hash uint32
  43. for {
  44. select {
  45. case <-ctx.Done():
  46. return nil, ctx.Err()
  47. default:
  48. }
  49. hash = hf.Sum32()
  50. if existing, ok := offsets[hash]; ok && len(existing) < maxWeakhashFinderHits {
  51. offsets[hash] = append(existing, i)
  52. }
  53. i++
  54. bt, err := r.ReadByte()
  55. if err == io.EOF {
  56. break
  57. } else if err != nil {
  58. return offsets, err
  59. }
  60. hf.Roll(bt)
  61. }
  62. return offsets, nil
  63. }
  64. func NewFinder(ctx context.Context, ir io.ReadSeeker, size int, hashesToFind []uint32) (*Finder, error) {
  65. offsets, err := Find(ctx, ir, hashesToFind, size)
  66. if err != nil {
  67. return nil, err
  68. }
  69. return &Finder{
  70. reader: ir,
  71. size: size,
  72. offsets: offsets,
  73. }, nil
  74. }
  75. type Finder struct {
  76. reader io.ReadSeeker
  77. size int
  78. offsets map[uint32][]int64
  79. }
  80. // Iterate iterates all available blocks that matches the provided hash, reads
  81. // them into buf, and calls the iterator function. The iterator function should
  82. // return whether it wishes to continue iterating.
  83. func (h *Finder) Iterate(hash uint32, buf []byte, iterFunc func(int64) bool) (bool, error) {
  84. if h == nil || hash == 0 || len(buf) != h.size {
  85. return false, nil
  86. }
  87. for _, offset := range h.offsets[hash] {
  88. _, err := h.reader.Seek(offset, io.SeekStart)
  89. if err != nil {
  90. return false, err
  91. }
  92. _, err = h.reader.Read(buf)
  93. if err != nil {
  94. return false, err
  95. }
  96. if !iterFunc(offset) {
  97. return true, nil
  98. }
  99. }
  100. return false, nil
  101. }