From a1b28d29dfe0d6d1289d8444944a2a49334f9eeb Mon Sep 17 00:00:00 2001 From: donmor Date: Tue, 23 Apr 2024 17:33:29 +0800 Subject: [PATCH 1/7] Update mergerfs.dedup --- src/mergerfs.dedup | 61 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 14 deletions(-) diff --git a/src/mergerfs.dedup b/src/mergerfs.dedup index 451ede6..bf0ead2 100755 --- a/src/mergerfs.dedup +++ b/src/mergerfs.dedup @@ -24,11 +24,24 @@ import os import random import shlex import sys +import zlib +# pseudo hasher for crc32 / adler32 +class p_hasher: + _alg = None + _data = bytearray() + def __init__(self, alg='crc32'): + self._alg = zlib.adler32 if alg == 'adler32' else zlib.crc32 + def update(self, data): + self._data.extend(data) + def hexdigest(self): + return _alg(bytes(_data)) + _libc = ctypes.CDLL("libc.so.6",use_errno=True) _lgetxattr = _libc.lgetxattr _lgetxattr.argtypes = [ctypes.c_char_p,ctypes.c_char_p,ctypes.c_void_p,ctypes.c_size_t] +_hashers = [hashlib.md5()] def lgetxattr(path,name): if type(path) == str: path = path.encode(errors='backslashreplace') @@ -184,17 +197,23 @@ def size_any(size,stats): return any([st.st_size == size for (path,st) in stats]) -def md5sums_all(stats): +def hashes_all(stats): if size_all(stats): - hashval = hash_file(stats[0][0]) - return all(hash_file(path) == hashval for (path,st) in stats[1:]) + for hasher in _hashers: + hashval = hash_file(stats[0][0]) + if not all(hash_file(path) == hashval for (path,st) in stats[1:]): + return False + return True return False -def short_md5sums_all(stats): +def short_hashes_all(stats): if size_all(stats): - hashval = short_hash_file(stats[0][0]) - return all(short_hash_file(path) == hashval for (path,st) in stats[1:]) + for hasher in _hashers: + hashval = short_hash_file(stats[0][0], hasher) + if not all(short_hash_file(path, hasher) == hashval for (path,st) in stats[1:]): + return False + return True return False @@ -323,10 +342,10 @@ def get_ignorefun(name): 'diff-time': lambda x: not mtime_all(x), 'same-size': size_all, 'diff-size': lambda x: not size_all(x), - 'same-hash': md5sums_all, - 'diff-hash': lambda x: not md5sums_all(x), - 'same-short-hash': short_md5sums_all, - 'diff-short-hash': lambda x: not short_md5sums_all(x) + 'same-hash': hashes_all, + 'diff-hash': lambda x: not hashes_all(x), + 'same-short-hash': short_hashes_all, + 'diff-short-hash': lambda x: not short_hashes_all(x) } return funs[name] @@ -421,12 +440,18 @@ optional arguments: * diff-size : have different sizes * same-time : have the same mtime * diff-time : have different mtimes - * same-hash : have the same md5sum - * diff-hash : have different md5sums - * same-short-hash : have the same short md5sums - * diff-short-hash : have different short md5sums + * same-hash : have the same hashes + * diff-hash : have different hashes + * same-short-hash : have the same short hashes + * diff-short-hash : have different short hashes 'hash' is expensive. 'short-hash' far less expensive, not as safe, but pretty good. + -h, --hash= Hashers used with -i (default: md5). Possible to + use multiple hashers in turn (separated by commas). + Available values: sha1, sha224, sha256, sha384, + sha512, sha3_224, sha3_256, sha3_384, sha3_512, + shake_128, shake_256, blake2b, blake2, md5, + adler32, crc32. -d, --dedup= What file to *keep* (default: mergerfs) * manual : ask user * oldest : file with smallest mtime @@ -469,6 +494,12 @@ def buildargparser(): 'same-hash','diff-hash', 'same-short-hash', 'diff-short-hash']) + parser.add_argument('-h','--hash', + choices=['sha1', 'sha224', 'sha256', 'sha384', + 'sha512', 'sha3_224', 'sha3_256', + 'sha3_384', 'sha3_512', 'shake_128', + 'shake_256', 'blake2b', 'blake2', + 'md5', 'adler32', 'crc32']) parser.add_argument('-d','--dedup', choices=['manual', 'oldest','newest', @@ -527,6 +558,8 @@ def main(): execute = args.execute includes = ['*'] if not args.include else args.include excludes = args.exclude + if args.hash: + _hashers = [p_hasher(s) if s in ['adler32', 'crc32'] else hashlib.new(s) for s in arg.hash.split(',')] total_size = 0 try: From 740d7e1715238ed5a53fca1993bfdeee5a154b8a Mon Sep 17 00:00:00 2001 From: donmor Date: Tue, 23 Apr 2024 17:36:23 +0800 Subject: [PATCH 2/7] Update README.md --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index 74f9199..2b9f5ab 100644 --- a/README.md +++ b/README.md @@ -133,6 +133,12 @@ optional arguments: * different-time : have different mtimes * same-hash : have the same md5sum * different-hash : have different md5sums + -h, --hash= Hashers used with -i (default: md5). Possible to + use multiple hashers in turn (separated by commas). + Available values: sha1, sha224, sha256, sha384, + sha512, sha3_224, sha3_256, sha3_384, sha3_512, + shake_128, shake_256, blake2b, blake2, md5, + adler32, crc32. -d, --dedup= What file to *keep* (default: newest) * manual : ask user * oldest : file with smallest mtime From 0895704fba1db8c6db57b3688e8ba6925ac1bc21 Mon Sep 17 00:00:00 2001 From: donmor Date: Wed, 24 Apr 2024 08:20:17 +0800 Subject: [PATCH 3/7] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2b9f5ab..9db4885 100644 --- a/README.md +++ b/README.md @@ -133,7 +133,7 @@ optional arguments: * different-time : have different mtimes * same-hash : have the same md5sum * different-hash : have different md5sums - -h, --hash= Hashers used with -i (default: md5). Possible to + -H, --hash= Hashers used with -i (default: md5). Possible to use multiple hashers in turn (separated by commas). Available values: sha1, sha224, sha256, sha384, sha512, sha3_224, sha3_256, sha3_384, sha3_512, From 6ae9b92f4dc79e762378db63234335fde3bea208 Mon Sep 17 00:00:00 2001 From: donmor Date: Wed, 24 Apr 2024 09:59:35 +0800 Subject: [PATCH 4/7] Update mergerfs.dedup --- src/mergerfs.dedup | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) diff --git a/src/mergerfs.dedup b/src/mergerfs.dedup index bf0ead2..e41f657 100755 --- a/src/mergerfs.dedup +++ b/src/mergerfs.dedup @@ -29,19 +29,19 @@ import zlib # pseudo hasher for crc32 / adler32 class p_hasher: - _alg = None - _data = bytearray() + _alg = '' + _dg = '' def __init__(self, alg='crc32'): self._alg = zlib.adler32 if alg == 'adler32' else zlib.crc32 def update(self, data): - self._data.extend(data) + self._dg += str(self._alg(data)) def hexdigest(self): - return _alg(bytes(_data)) + return self._dg _libc = ctypes.CDLL("libc.so.6",use_errno=True) _lgetxattr = _libc.lgetxattr _lgetxattr.argtypes = [ctypes.c_char_p,ctypes.c_char_p,ctypes.c_void_p,ctypes.c_size_t] -_hashers = [hashlib.md5()] +_hashers = ['md5'] def lgetxattr(path,name): if type(path) == str: path = path.encode(errors='backslashreplace') @@ -199,9 +199,11 @@ def size_any(size,stats): def hashes_all(stats): if size_all(stats): - for hasher in _hashers: - hashval = hash_file(stats[0][0]) - if not all(hash_file(path) == hashval for (path,st) in stats[1:]): + if not short_hashes_all(stats): + return False + for h in _hashers: + hashval = hash_file(stats[0][0], p_hasher(h) if h in ['adler32', 'crc32'] else hashlib.new(h)) + if not all(hash_file(path, p_hasher(h) if h in ['adler32', 'crc32'] else hashlib.new(h)) == hashval for (path,st) in stats[1:]): return False return True return False @@ -209,10 +211,10 @@ def hashes_all(stats): def short_hashes_all(stats): if size_all(stats): - for hasher in _hashers: - hashval = short_hash_file(stats[0][0], hasher) - if not all(short_hash_file(path, hasher) == hashval for (path,st) in stats[1:]): - return False + for h in _hashers: + hashval = short_hash_file(stats[0][0], p_hasher(h) if h in ['adler32', 'crc32'] else hashlib.new(h)) + if not all(short_hash_file(path, p_hasher(h) if h in ['adler32', 'crc32'] else hashlib.new(h)) == hashval for (path,st) in stats[1:]): + return False return True return False @@ -446,8 +448,8 @@ optional arguments: * diff-short-hash : have different short hashes 'hash' is expensive. 'short-hash' far less expensive, not as safe, but pretty good. - -h, --hash= Hashers used with -i (default: md5). Possible to - use multiple hashers in turn (separated by commas). + -H, --hash= Hashers used with -i (default: md5). + Can be used multiple times, used in turn. Available values: sha1, sha224, sha256, sha384, sha512, sha3_224, sha3_256, sha3_384, sha3_512, shake_128, shake_256, blake2b, blake2, md5, @@ -495,11 +497,14 @@ def buildargparser(): 'same-short-hash', 'diff-short-hash']) parser.add_argument('-h','--hash', + type=str, choices=['sha1', 'sha224', 'sha256', 'sha384', 'sha512', 'sha3_224', 'sha3_256', 'sha3_384', 'sha3_512', 'shake_128', 'shake_256', 'blake2b', 'blake2', - 'md5', 'adler32', 'crc32']) + 'md5', 'adler32', 'crc32'], + action='append', + default=[]) parser.add_argument('-d','--dedup', choices=['manual', 'oldest','newest', @@ -512,7 +517,6 @@ def buildargparser(): parser.add_argument('-e','--execute', action='store_true') parser.add_argument('-I','--include', - type=str, action='append', default=[]) parser.add_argument('-E','--exclude', @@ -559,7 +563,8 @@ def main(): includes = ['*'] if not args.include else args.include excludes = args.exclude if args.hash: - _hashers = [p_hasher(s) if s in ['adler32', 'crc32'] else hashlib.new(s) for s in arg.hash.split(',')] + global _hashers + _hashers = args.hash total_size = 0 try: From 2cf3e82e79cf07668738315f20b3400100412e4e Mon Sep 17 00:00:00 2001 From: donmor Date: Wed, 24 Apr 2024 10:00:07 +0800 Subject: [PATCH 5/7] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 9db4885..efede68 100644 --- a/README.md +++ b/README.md @@ -133,8 +133,8 @@ optional arguments: * different-time : have different mtimes * same-hash : have the same md5sum * different-hash : have different md5sums - -H, --hash= Hashers used with -i (default: md5). Possible to - use multiple hashers in turn (separated by commas). + -H, --hash= Hashers used with -i (default: md5). + Can be used multiple times, used in turn. Available values: sha1, sha224, sha256, sha384, sha512, sha3_224, sha3_256, sha3_384, sha3_512, shake_128, shake_256, blake2b, blake2, md5, From 672462c1b2113712bc9fc19fe91e94dd95840865 Mon Sep 17 00:00:00 2001 From: donmor Date: Wed, 24 Apr 2024 10:05:47 +0800 Subject: [PATCH 6/7] Update mergerfs.dedup --- src/mergerfs.dedup | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mergerfs.dedup b/src/mergerfs.dedup index e41f657..2d4fa12 100755 --- a/src/mergerfs.dedup +++ b/src/mergerfs.dedup @@ -29,7 +29,7 @@ import zlib # pseudo hasher for crc32 / adler32 class p_hasher: - _alg = '' + _alg = None _dg = '' def __init__(self, alg='crc32'): self._alg = zlib.adler32 if alg == 'adler32' else zlib.crc32 From 0f405740923bbf902d0406574ebf9a0b46f8959a Mon Sep 17 00:00:00 2001 From: donmor Date: Wed, 24 Apr 2024 10:08:44 +0800 Subject: [PATCH 7/7] Update mergerfs.dedup --- src/mergerfs.dedup | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/mergerfs.dedup b/src/mergerfs.dedup index 2d4fa12..dd24a40 100755 --- a/src/mergerfs.dedup +++ b/src/mergerfs.dedup @@ -496,15 +496,15 @@ def buildargparser(): 'same-hash','diff-hash', 'same-short-hash', 'diff-short-hash']) - parser.add_argument('-h','--hash', + parser.add_argument('-H','--hash', type=str, choices=['sha1', 'sha224', 'sha256', 'sha384', 'sha512', 'sha3_224', 'sha3_256', 'sha3_384', 'sha3_512', 'shake_128', 'shake_256', 'blake2b', 'blake2', 'md5', 'adler32', 'crc32'], - action='append', - default=[]) + action='append', + default=[]) parser.add_argument('-d','--dedup', choices=['manual', 'oldest','newest', @@ -517,6 +517,7 @@ def buildargparser(): parser.add_argument('-e','--execute', action='store_true') parser.add_argument('-I','--include', + type=str, action='append', default=[]) parser.add_argument('-E','--exclude', @@ -564,7 +565,7 @@ def main(): excludes = args.exclude if args.hash: global _hashers - _hashers = args.hash + _hashers = args.hash total_size = 0 try: