  1. // Package dedupe provides the dedupe command.
  2. package dedupe
  3. import (
  4. "context"
  5. "log"
  6. "github.com/rclone/rclone/cmd"
  7. "github.com/rclone/rclone/fs"
  8. "github.com/rclone/rclone/fs/config/flags"
  9. "github.com/rclone/rclone/fs/operations"
  10. "github.com/spf13/cobra"
  11. )
  12. var (
  13. dedupeMode = operations.DeduplicateInteractive
  14. byHash = false
  15. )
  16. func init() {
  17. cmd.Root.AddCommand(commandDefinition)
  18. cmdFlag := commandDefinition.Flags()
  19. flags.FVarP(cmdFlag, &dedupeMode, "dedupe-mode", "", "Dedupe mode interactive|skip|first|newest|oldest|largest|smallest|rename", "")
  20. flags.BoolVarP(cmdFlag, &byHash, "by-hash", "", false, "Find identical hashes rather than names", "")
  21. }
// commandDefinition implements "rclone dedupe". It takes an optional
// leading [mode] argument which, when present, overrides the
// --dedupe-mode flag before the remote is deduplicated.
var commandDefinition = &cobra.Command{
	Use:   "dedupe [mode] remote:path",
	Short: `Interactively find duplicate filenames and delete/rename them.`,
	Long: `
By default ` + "`dedupe`" + ` interactively finds files with duplicate
names and offers to delete all but one or rename them to be
different. This is known as deduping by name.

Deduping by name is only useful with a small group of backends (e.g. Google Drive,
Opendrive) that can have duplicate file names. It can be run on wrapping backends
(e.g. crypt) if they wrap a backend which supports duplicate file
names.

However if ` + "`--by-hash`" + ` is passed in then dedupe will find files with
duplicate hashes instead which will work on any backend which supports
at least one hash. This can be used to find files with duplicate
content. This is known as deduping by hash.

If deduping by name, first rclone will merge directories with the same
name. It will do this iteratively until all the identically named
directories have been merged.

Next, if deduping by name, for every group of duplicate file names /
hashes, it will delete all but one identical file it finds without
confirmation. This means that for most duplicated files the ` +
		"`dedupe`" + ` command will not be interactive.

` + "`dedupe`" + ` considers files to be identical if they have the
same file path and the same hash. If the backend does not support hashes (e.g. crypt wrapping
Google Drive) then they will never be found to be identical. If you
use the ` + "`--size-only`" + ` flag then files will be considered
identical if they have the same size (any hash will be ignored). This
can be useful on crypt backends which do not support hashes.

Next rclone will resolve the remaining duplicates. Exactly which
action is taken depends on the dedupe mode. By default, rclone will
interactively query the user for each one.

**Important**: Since this can cause data loss, test first with the
` + "`--dry-run` or the `--interactive`/`-i`" + ` flag.

Here is an example run.

Before - with duplicates

    $ rclone lsl drive:dupes
      6048320 2016-03-05 16:23:16.798000000 one.txt
      6048320 2016-03-05 16:23:11.775000000 one.txt
       564374 2016-03-05 16:23:06.731000000 one.txt
      6048320 2016-03-05 16:18:26.092000000 one.txt
      6048320 2016-03-05 16:22:46.185000000 two.txt
      1744073 2016-03-05 16:22:38.104000000 two.txt
       564374 2016-03-05 16:22:52.118000000 two.txt

Now the ` + "`dedupe`" + ` session

    $ rclone dedupe drive:dupes
    2016/03/05 16:24:37 Google drive root 'dupes': Looking for duplicates using interactive mode.
    one.txt: Found 4 files with duplicate names
    one.txt: Deleting 2/3 identical duplicates (MD5 "1eedaa9fe86fd4b8632e2ac549403b36")
    one.txt: 2 duplicates remain
      1:      6048320 bytes, 2016-03-05 16:23:16.798000000, MD5 1eedaa9fe86fd4b8632e2ac549403b36
      2:       564374 bytes, 2016-03-05 16:23:06.731000000, MD5 7594e7dc9fc28f727c42ee3e0749de81
    s) Skip and do nothing
    k) Keep just one (choose which in next step)
    r) Rename all to be different (by changing file.jpg to file-1.jpg)
    s/k/r> k
    Enter the number of the file to keep> 1
    one.txt: Deleted 1 extra copies
    two.txt: Found 3 files with duplicate names
    two.txt: 3 duplicates remain
      1:       564374 bytes, 2016-03-05 16:22:52.118000000, MD5 7594e7dc9fc28f727c42ee3e0749de81
      2:      6048320 bytes, 2016-03-05 16:22:46.185000000, MD5 1eedaa9fe86fd4b8632e2ac549403b36
      3:      1744073 bytes, 2016-03-05 16:22:38.104000000, MD5 851957f7fb6f0bc4ce76be966d336802
    s) Skip and do nothing
    k) Keep just one (choose which in next step)
    r) Rename all to be different (by changing file.jpg to file-1.jpg)
    s/k/r> r
    two-1.txt: renamed from: two.txt
    two-2.txt: renamed from: two.txt
    two-3.txt: renamed from: two.txt

The result being

    $ rclone lsl drive:dupes
      6048320 2016-03-05 16:23:16.798000000 one.txt
       564374 2016-03-05 16:22:52.118000000 two-1.txt
      6048320 2016-03-05 16:22:46.185000000 two-2.txt
      1744073 2016-03-05 16:22:38.104000000 two-3.txt

Dedupe can be run non interactively using the ` + "`" + `--dedupe-mode` + "`" + ` flag or by using an extra parameter with the same value

  * ` + "`" + `--dedupe-mode interactive` + "`" + ` - interactive as above.
  * ` + "`" + `--dedupe-mode skip` + "`" + ` - removes identical files then skips anything left.
  * ` + "`" + `--dedupe-mode first` + "`" + ` - removes identical files then keeps the first one.
  * ` + "`" + `--dedupe-mode newest` + "`" + ` - removes identical files then keeps the newest one.
  * ` + "`" + `--dedupe-mode oldest` + "`" + ` - removes identical files then keeps the oldest one.
  * ` + "`" + `--dedupe-mode largest` + "`" + ` - removes identical files then keeps the largest one.
  * ` + "`" + `--dedupe-mode smallest` + "`" + ` - removes identical files then keeps the smallest one.
  * ` + "`" + `--dedupe-mode rename` + "`" + ` - removes identical files then renames the rest to be different.
  * ` + "`" + `--dedupe-mode list` + "`" + ` - lists duplicate dirs and files only and changes nothing.

For example, to rename all the identically named photos in your Google Photos directory, do

    rclone dedupe --dedupe-mode rename "drive:Google Photos"

Or

    rclone dedupe rename "drive:Google Photos"
`,
	Annotations: map[string]string{
		"versionIntroduced": "v1.27",
		"groups":            "Important",
	},
	Run: func(command *cobra.Command, args []string) {
		// One or two args: an optional mode followed by the remote.
		cmd.CheckArgs(1, 2, command, args)
		if len(args) > 1 {
			// Positional mode overrides the --dedupe-mode flag.
			err := dedupeMode.Set(args[0])
			if err != nil {
				log.Fatal(err)
			}
			args = args[1:]
		}
		fdst := cmd.NewFsSrc(args)
		// Name deduping is a no-op on backends which cannot hold
		// duplicate names, so warn and suggest --by-hash, but continue
		// anyway rather than failing.
		if !byHash && !fdst.Features().DuplicateFiles {
			fs.Logf(fdst, "Can't have duplicate names here. Perhaps you wanted --by-hash ? Continuing anyway.")
		}
		cmd.Run(false, false, command, func() error {
			return operations.Deduplicate(context.Background(), fdst, dedupeMode, byHash)
		})
	},
}