// Copyright © 2015 Pierre Neidhardt <ambrevar at gmail dot com>
// Use of this file is governed by the license that can be found in LICENSE.

/*
A filesystem hierarchy synchronizer

Rename files in TARGET so that identical files found in SOURCE and TARGET have
the same relative path.

The main goal of the program is to make folder synchronization faster by
sparing big file transfers when a simple rename suffices. It complements other
synchronization programs that lack this capability.

See http://ambrevar.bitbucket.org/hsync and 'hsync -h' for more details.

Usage:

	hsync [OPTIONS] SOURCE TARGET

For usage options, see:

	hsync -h

Implementation details

TODO: review this doc.

We store the file entries in the following structure:

	entries := map[partialHash struct{size int64, pos int64, hash string}]fileMatch struct{
		sourceID *fileID{path string, h hash.Hash},
		targetID *fileID{path string, h hash.Hash}
	}

This 'entries' map indexes the possible file matches by content ('partialHash').
A file match references the paths that will be used to rename the file in TARGET
from 'oldpath' to 'newpath'. Note that 'newpath' is given by 'sourceID.path',
and 'oldpath' by 'targetID.path'.
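
For illustration, here is a hypothetical entry for a file found in both trees
at different locations, before any checksum roll (paths and size are made up):

	entries[partialHash{size: 5, pos: 0, hash: ""}] = fileMatch{
		sourceID: &fileID{path: "music/song.ogg"},
		targetID: &fileID{path: "misc/song.ogg"},
	}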

The algorithm is centered around one main optimization: rolling checksums. We
assume that two files match if they have the same partial hash.

The initial partial hash is just the size with an empty hash. This speeds up
the process since it saves an open/close of the file. We just need a 'stat'.
Files will not be read unless a rolling checksum is required. As a consequence,
unreadable files with a unique size will be stored in 'entries', while
unreadable conflicting files will be discarded. Note that this scheme allows
renaming files that cannot be read.

One checksum roll increments 'pos' and updates the hash by hashing the next
BLOCKSIZE bytes of the file. BLOCKSIZE is set to a value that is commonly
believed to be optimal in most cases. The optimal value would be the device
blocksize where the file resides. It would be more complex and memory consuming
to query this value for each file.
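
As an illustration (file size made up), the partial hash of a 10000-byte file
evolves as follows with BLOCKSIZE = 4096; each roll feeds the next block into
the cumulative digest:

	{size: 10000, pos: 0, hash: ""}                  // initial state, 'stat' only
	{size: 10000, pos: 1, hash: md5(bytes 0-4095)}
	{size: 10000, pos: 2, hash: md5(bytes 0-8191)}
	{size: 10000, pos: 3, hash: md5(bytes 0-9999)}   // end-of-file, hash complete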

We choose md5 (128 bits) as the checksum algorithm. Adler32, CRC-32 and CRC-64
are only marginally faster while suffering from more collisions. This choice
should be backed up with a proper benchmark.

A conflict arises when two files in either SOURCE or TARGET have the same
partial hash. We solve the conflict by updating the partial hashes until they
differ. If the partial hashes cannot be updated any further (i.e. we reached
end-of-file), it means that the files are duplicates.

Notes:

- Partial hashes of conflicting files will be complete at the same roll since
they have the same size.

- When a partial hash is complete, we have the following relation:

	(pos-1)*BLOCKSIZE < filesize <= pos*BLOCKSIZE

- There is only one possible conflicting file at a time.
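
For instance (size made up), a 10000-byte file is complete at pos = 3 with
BLOCKSIZE = 4096, since 2*4096 = 8192 < 10000 <= 12288 = 3*4096.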

A file match may be erroneous if the partial hash is not complete. The most
obvious case is when two different files are the only ones of size N in SOURCE
and TARGET. This downside is a consequence of the design choice, i.e. the focus
on speed. Erroneous matches can be corrected in the preview file. If we wanted
no ambiguity, we would have to compute the full hashes, and this would take
approximately as much time as copying files from SOURCE to TARGET, like a
regular synchronization tool would do.

We store the digest 'hash.Hash' together with the file path so that we can
update a partial hash later on.

Process:

1. We walk SOURCE completely. Only regular files are processed. The 'sourceID'
are stored. If two entries conflict (they have the same partial hash), we
update the partial hashes until they do not conflict anymore. If the conflict
is not resolvable, i.e. the partial hash is complete and the files are
identical, we drop both files from 'entries'.

Future files can have the same partial hash that led to a former conflict. To
distinguish the content from former conflicts when adding a new file, we must
compute the partial hash up to the 'pos' of the last conflict (the number of
checksum rolls). To keep track of this 'pos' when there is a conflict, we mark
all computed partial hashes as dummy values. When the next entry is added, we
compute its partial hash until it does not match a dummy value in 'entries',
as the following sketch illustrates.
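
Sketch (sizes and hashes are placeholders): two 8192-byte files that diverge
only at the second roll leave these entries behind; a third file of the same
size must now be rolled past the dummies before we know which entry it matches:

	entries[partialHash{size: 8192, pos: 0, hash: ""}]  = fileMatch{}  // dummy
	entries[partialHash{size: 8192, pos: 1, hash: h1}]  = fileMatch{}  // dummy, both files still matched
	entries[partialHash{size: 8192, pos: 2, hash: h2a}] = fileMatch{sourceID: &file1}
	entries[partialHash{size: 8192, pos: 2, hash: h2b}] = fileMatch{sourceID: &file2}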

Duplicates are not processed, but a warning is displayed. Usually the user does
not want duplicates, so she is better off fixing them before proceeding with
the renames. It would add a lot of complexity to handle duplicates properly.

2. We walk TARGET completely. We skip all dummies as in the SOURCE walk. We
need to analyze SOURCE completely before we can check for matches.

- If there are only dummy entries, there was an unsolvable conflict in SOURCE.
We drop the file.

- If we end on a non-empty entry with an 'unsolvable' targetID, it means that
an unsolvable conflict with target files happened with this partial hash. This
is only possible at end-of-file. We drop the file.

- If we end on an empty entry, there is no match with SOURCE and we drop the
file.

- If we end on a non-empty entry without previous matches, we store the match.

- Else we end on a non-empty entry with one match already present. This is a
conflict. We solve the conflict as for the SOURCE walk, except that we need to
update the partial hashes of three files: the SOURCE file, the first TARGET
match and the new TARGET match.

3. We generate the 'renameOps' and 'reverseOps' maps. They map 'oldpath' to
'newpath' and 'newpath' to 'oldpath' respectively. We drop entries where
'oldpath == newpath' to spare a lot of noise.

Note that file names are not used to compute a match since they could be
identical while the content would be different.
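
For illustration (hypothetical paths), a single planned rename yields:

	renameOps  = {"misc/song.ogg": "music/song.ogg"}
	reverseOps = {"music/song.ogg": "misc/song.ogg"}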

4. We proceed with the renames. Chains and cycles may occur.

- Example of a chain of renames: a->b, b->c, c->d.
- Example of a cycle of renames: a->b, b->c, c->a.

TARGET must be fully analyzed before proceeding with the renames so that we
can detect chains.

We always traverse chains until we reach the end, then rename the elements
while going backward till the beginning. The beginning can be before the entry
point. 'reverseOps' is used for going backward.

When a cycle is detected, we break it by renaming one file to a temporary name.
Then we process everything as for a chain. Finally we rename the temporary file
to its original newpath, as the following trace illustrates.
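
Trace of the cycle a->b, b->c, c->a, entered at 'a' (names are placeholders):

	forward traversal stops at c->a, since 'a' is the entry point: a cycle;
	break the cycle: rename c to a temporary file tmp;
	go backward: rename b->c, then a->b;
	finally rename tmp->a.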
*/
package main

import (
	"crypto/md5"
	"encoding/json"
	"flag"
	"fmt"
	"hash"
	"io"
	"io/ioutil"
	"log"
	"os"
	"path/filepath"
)

const (
	APPLICATION = "hsync"
	VERSION     = "1.0"
	COPYRIGHT   = "Copyright (C) 2015 Pierre Neidhardt"
	BLOCKSIZE   = 4096
	SEPARATOR   = string(os.PathSeparator)
)

var usage = `Filesystem hierarchy synchronizer

Rename files in TARGET so that identical files found in SOURCE and TARGET have
the same relative path.

The main goal of the program is to make folder synchronization faster by
sparing big file transfers when a simple rename suffices. It complements other
synchronization programs that lack this capability.

By default, files are not renamed and a preview is printed to standard output.
False positives can happen, e.g. if two different files in SOURCE and TARGET
are the only ones of this size. Use the preview to spot false positives and
make sure all files get renamed properly.

You can redirect the preview to a file. If you run the program using this
preview file as SOURCE, the analysis will be skipped. This is useful if you
want to tweak the result of the analysis.

Notes:
- Duplicate files in either folder are skipped.
- Only regular files are processed. In particular, empty folders and symbolic
links are ignored.
`
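
// A preview is a JSON map from 'oldpath' to 'newpath'. For illustration
// (hypothetical paths), a redirected preview file could contain:
//
//	{
//		"misc/song.ogg": "music/song.ogg"
//	}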

// We attach a hash digest to the path so that we can update partial hashes
// with the rolling-checksum function.
type fileID struct {
	path string
	h    hash.Hash
}

var unsolvable = fileID{path: SEPARATOR}

// A fileMatch stores two fileIDs with matching content. A match can be partial
// and further processing can disprove it.
// - If 'sourceID==nil', this is a dummy match. It means that a file of the
// same size with a longer partialHash has been processed.
// - If 'targetID==nil', a match is yet to be found.
// - If 'targetID==&unsolvable', several TARGET files conflict together for
// this SOURCE file; the entry should be skipped.
type fileMatch struct {
	sourceID *fileID
	targetID *fileID
}

// partialHash is used as a key to identify the file content.
// The 'size' field should always be set, however the 'pos' and 'hash' fields
// are computed only when required. No hash has been computed when 'pos==0'.
type partialHash struct {
	size int64
	pos  int64
	hash string
}

// rollingChecksum returns io.EOF on the last roll.
// The caller need not open `file`; however, it must close it. This manual
// management avoids having to open and close the file repeatedly.
func rollingChecksum(fid *fileID, key *partialHash, file **os.File) (err error) {
	if *file == nil {
		*file, err = os.Open(fid.path)
		if err != nil {
			return
		}
	}

	buf := [BLOCKSIZE]byte{}
	n, err := (*file).ReadAt(buf[:], key.pos*BLOCKSIZE)
	if err != nil && err != io.EOF {
		return
	}
	fid.h.Write(buf[:n])
	key.pos++
	key.hash = string(fid.h.Sum(nil))
	return
}

func newFileEntry(path string, size int64) (fileID, partialHash) {
	return fileID{path: path, h: md5.New()}, partialHash{size: size}
}
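
// A minimal usage sketch, assuming 'path' and 'size' come from a prior stat:
//
//	id, key := newFileEntry(path, size)
//	var f *os.File
//	var err error
//	for err == nil {
//		err = rollingChecksum(&id, &key, &f)
//	}
//	// err is io.EOF once the whole file has been hashed.
//	if f != nil {
//		f.Close()
//	}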

func visitSource(root string, entries map[partialHash]fileMatch) {
	// Change folder to 'root' so that 'root' does not get stored in fileID.path.
	oldroot, err := os.Getwd()
	if err != nil {
		log.Fatal(err)
	}
	err = os.Chdir(root)
	if err != nil {
		log.Fatal(err)
	}
	defer os.Chdir(oldroot)

	visitor := func(input string, info os.FileInfo, ignored error) error {
		if !info.Mode().IsRegular() {
			return nil
		}

		inputID, inputKey := newFileEntry(input, info.Size())

		var err error
		var inputFile, conflictFile *os.File
		defer func() {
			if inputFile != nil {
				inputFile.Close()
			}
		}()
		defer func() {
			if conflictFile != nil {
				conflictFile.Close()
			}
		}()

		// Skip dummy matches.
		v, ok := entries[inputKey]
		for ok && v.sourceID == nil && err != io.EOF {
			err = rollingChecksum(&inputID, &inputKey, &inputFile)
			if err != nil && err != io.EOF {
				log.Println(err)
				return nil
			}
			v, ok = entries[inputKey]
		}

		if ok && v.sourceID == nil {
			log.Printf("Source duplicate: %v (%x)\n", inputID.path, inputKey.hash)
			return nil
		} else if !ok {
			entries[inputKey] = fileMatch{sourceID: &inputID}
			return nil
		}

		// Else there is a conflict.
		conflictKey := inputKey
		conflictID := entries[inputKey].sourceID

		for inputKey == conflictKey && err == nil {
			// Set dummy value to mark the key as visited for future files.
			entries[inputKey] = fileMatch{}

			err = rollingChecksum(&inputID, &inputKey, &inputFile)
			if err != nil && err != io.EOF {
				// Read error. Drop input.
				log.Println(err)
				return nil
			}

			err = rollingChecksum(conflictID, &conflictKey, &conflictFile)
			if err != nil && err != io.EOF {
				// Read error. We will replace conflict with input.
				log.Println(err)
				break
			}
		}

		if inputKey == conflictKey && err == io.EOF {
			entries[inputKey] = fileMatch{}
			log.Printf("Source duplicate: %v (%x)\n", inputID.path, inputKey.hash)
			log.Printf("Source duplicate: %v (%x)\n", conflictID.path, conflictKey.hash)
		} else {
			// Resolved conflict.
			entries[inputKey] = fileMatch{sourceID: &inputID}
			if err == nil || err == io.EOF {
				// Re-add conflicting file except on read error.
				entries[conflictKey] = fileMatch{sourceID: conflictID}
			}
		}

		return nil
	}

	// Since we do not stop on read errors while walking, the returned error is
	// always nil.
	filepath.Walk(".", visitor)
}

// See comments in visitSource.
func visitTarget(root, sourceRoot string, entries map[partialHash]fileMatch) {
	oldroot, err := os.Getwd()
	if err != nil {
		log.Fatal(err)
	}
	err = os.Chdir(root)
	if err != nil {
		log.Fatal(err)
	}
	defer os.Chdir(oldroot)

	visitor := func(input string, info os.FileInfo, ignored error) error {
		if !info.Mode().IsRegular() {
			return nil
		}

		inputID, inputKey := newFileEntry(input, info.Size())

		var err error
		var inputFile, conflictFile, sourceFile *os.File
		defer func() {
			if inputFile != nil {
				inputFile.Close()
			}
		}()
		defer func() {
			if conflictFile != nil {
				conflictFile.Close()
			}
		}()
		defer func() {
			if sourceFile != nil {
				sourceFile.Close()
			}
		}()

		// Skip dummy matches.
		v, ok := entries[inputKey]
		for ok && v.sourceID == nil && err != io.EOF {
			err = rollingChecksum(&inputID, &inputKey, &inputFile)
			if err != nil && err != io.EOF {
				log.Println(err)
				return nil
			}
			v, ok = entries[inputKey]
		}

		if ok && v.sourceID == nil {
			log.Printf("Target duplicate match: %v (%x)\n", inputID.path, inputKey.hash)
			return nil
		} else if ok && v.targetID != nil && v.targetID == &unsolvable {
			// Unresolved conflict happened previously.
			log.Printf("Target duplicate: %v (%x), source match: %v\n", inputID.path, inputKey.hash, v.sourceID.path)
			return nil
		} else if !ok {
			// No matching file in source.
			return nil
		} else if v.targetID == nil {
			// First match.
			entries[inputKey] = fileMatch{sourceID: entries[inputKey].sourceID, targetID: &inputID}
			return nil
		}

		// Else there is a conflict.
		sourceKey := inputKey
		sourceID := entries[inputKey].sourceID
		conflictKey := inputKey
		conflictID := entries[inputKey].targetID
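
		// Roll the three partial hashes in lockstep (the SOURCE file, the
		// previous TARGET match and the new TARGET match) until one of them
		// diverges or end-of-file proves duplication.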
		for inputKey == conflictKey && inputKey == sourceKey && err == nil {
			// Set dummy value to mark the key as visited for future files.
			entries[inputKey] = fileMatch{}

			// Since we change folders, we don't have to store the root in
			// fileID, nor do we have to compute sourceRoot's realpath to open
			// the file from this point.
			os.Chdir(oldroot)
			os.Chdir(sourceRoot)
			err = rollingChecksum(sourceID, &sourceKey, &sourceFile)
			os.Chdir(oldroot)
			os.Chdir(root)
			if err != nil && err != io.EOF {
				// Read error. Drop all entries.
				log.Println(err)
				return nil
			}

			err = rollingChecksum(&inputID, &inputKey, &inputFile)
			inputErr := err
			if err != nil && err != io.EOF {
				// Read error. Drop input.
				log.Println(err)
				// We don't break now as there is still a chance that the
				// conflicting file matches the source.
			}

			err = rollingChecksum(conflictID, &conflictKey, &conflictFile)
			if err != nil && err != io.EOF {
				// Read error. We will replace conflict with input if the
				// latter has been read correctly.
				log.Println(err)
				break
			}

			if inputErr != nil && inputErr != io.EOF {
				break
			}
		}

		if inputKey == sourceKey && inputKey == conflictKey && err == io.EOF {
			log.Printf("Target duplicate: %v (%x), source match: %v\n", inputID.path, inputKey.hash, v.sourceID.path)
			log.Printf("Target duplicate: %v (%x), source match: %v\n", conflictID.path, conflictKey.hash, v.sourceID.path)
			// We mark the source file with an unresolved conflict for future
			// target files.
			entries[sourceKey] = fileMatch{sourceID: sourceID, targetID: &unsolvable}
		} else if inputKey == sourceKey && inputKey != conflictKey {
			// Resolution: drop the conflicting entry.
			entries[sourceKey] = fileMatch{sourceID: sourceID, targetID: &inputID}
		} else if conflictKey == sourceKey && conflictKey != inputKey {
			// Resolution: drop the input entry.
			entries[sourceKey] = fileMatch{sourceID: sourceID, targetID: conflictID}
		} else if conflictKey != sourceKey && inputKey != sourceKey {
			// Resolution: drop both entries.
			entries[sourceKey] = fileMatch{sourceID: sourceID}
		}
		// Else we drop all entries.

		return nil
	}

	filepath.Walk(".", visitor)
}

// Rename files as specified in renameOps.
// Chains and cycles may occur. See the implementation details.
func processRenames(root string, renameOps, reverseOps map[string]string, clobber bool) {
	// Change folder since the renames are made relative to 'root'.
	oldroot, err := os.Getwd()
	if err != nil {
		log.Fatal(err)
	}
	err = os.Chdir(root)
	if err != nil {
		log.Fatal(err)
	}
	defer os.Chdir(oldroot)

	for oldpath, newpath := range renameOps {
		if oldpath == newpath {
			continue
		}

		cycleMarker := oldpath

		// Go forward to the end of the chain or the cycle.
		for newpath != cycleMarker {
			_, ok := renameOps[newpath]
			if !ok {
				break
			}
			oldpath = newpath
			newpath = renameOps[newpath]
		}

		// Break the cycle if any.
		tmp := ""
		if cycleMarker == newpath {
			f, err := ioutil.TempFile(".", APPLICATION)
			if err != nil {
				log.Fatal(err)
			}
			newpath = f.Name()
			tmp = newpath
			f.Close()
			delete(reverseOps, cycleMarker)
		}

		// Process the chain of renames. Renaming can still fail, in which
		// case we output the error and go on with the chain.
		for oldpath != "" {
			err = os.MkdirAll(filepath.Dir(newpath), 0777)
			if err != nil {
				log.Println(err)
			} else {
				var err error
				if clobber {
					err = os.Rename(oldpath, newpath)
					if err != nil {
						log.Println(err)
					}
				} else {
					err = os.Link(oldpath, newpath)
					if err != nil {
						log.Println(err)
					} else {
						err = os.Remove(oldpath)
						if err != nil {
							log.Println(err)
						}
					}
				}
				if err == nil {
					log.Printf("Rename '%v' -> '%v'", oldpath, newpath)
				}
			}

			// Go backward.
			delete(renameOps, oldpath)
			newpath = oldpath
			oldpath = reverseOps[oldpath]
		}

		// Restore the cycle if need be.
		if tmp != "" {
			err = os.Rename(tmp, cycleMarker)
			if err != nil {
				log.Println(err)
			} else {
				log.Printf("Rename '%v' -> '%v'", tmp, cycleMarker)
			}
		}
	}
}

func init() {
	log.SetFlags(0)
}

func main() {
	flag.Usage = func() {
		fmt.Fprintf(os.Stderr, "Usage: %v SOURCE TARGET\n\n", os.Args[0])
		fmt.Fprintln(os.Stderr, usage)
		fmt.Fprintln(os.Stderr, "Options:")
		flag.PrintDefaults()
	}

	var flagClobber = flag.Bool("f", false, "Overwrite existing files in TARGET.")
	var flagProcess = flag.Bool("p", false, "Rename the files in TARGET.")
	var flagVersion = flag.Bool("v", false, "Print version and exit.")
	flag.Parse()

	if *flagVersion {
		fmt.Println(APPLICATION, VERSION, COPYRIGHT)
		return
	}

	if flag.Arg(0) == "" || flag.Arg(1) == "" {
		flag.Usage()
		return
	}

	renameOps := make(map[string]string)
	reverseOps := make(map[string]string)

	s, err := os.Stat(flag.Arg(0))
	if err != nil {
		log.Fatal(err)
	}
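
	// A folder SOURCE triggers the analysis; a regular-file SOURCE is read as
	// a preview (JSON map of renames) and the analysis is skipped.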
	if s.IsDir() {
		entries := make(map[partialHash]fileMatch)
		log.Printf(":: Analyzing '%v'", flag.Arg(0))
		visitSource(flag.Arg(0), entries)
		log.Printf(":: Analyzing '%v'", flag.Arg(1))
		visitTarget(flag.Arg(1), flag.Arg(0), entries)

		for _, v := range entries {
			if v.targetID != nil && v.targetID != &unsolvable && v.targetID.path != v.sourceID.path {
				renameOps[v.targetID.path] = v.sourceID.path
				reverseOps[v.sourceID.path] = v.targetID.path
			}
		}
	} else {
		buf, err := ioutil.ReadFile(flag.Arg(0))
		if err != nil {
			log.Fatal(err)
		}
		err = json.Unmarshal(buf, &renameOps)
		if err != nil {
			log.Fatal(err)
		}

		for oldpath, newpath := range renameOps {
			if oldpath == newpath {
				delete(renameOps, oldpath)
				continue
			}
			_, err := os.Stat(flag.Arg(1) + SEPARATOR + oldpath)
			if err != nil && os.IsNotExist(err) {
				// Remove non-existing entries.
				delete(renameOps, oldpath)
				continue
			}
			reverseOps[newpath] = oldpath
		}
	}

	if *flagProcess {
		log.Println(":: Processing renames")
		processRenames(flag.Arg(1), renameOps, reverseOps, *flagClobber)
	} else {
		log.Println(":: Previewing renames")
		// There should be no error.
		buf, _ := json.MarshalIndent(renameOps, "", "\t")
		os.Stdout.Write(buf)
		fmt.Println()
	}
}