2021-01-14 10:35:43 +00:00
|
|
|
import argparse
|
2019-12-10 20:57:23 +00:00
|
|
|
import hashlib
|
|
|
|
import os
|
|
|
|
import send2trash
|
|
|
|
import sys
|
|
|
|
|
2021-05-25 08:37:50 +00:00
|
|
|
from voussoirkit import bytestring
|
2021-01-14 10:35:43 +00:00
|
|
|
from voussoirkit import lazychain
|
2020-01-29 03:34:33 +00:00
|
|
|
from voussoirkit import pathclass
|
2021-01-14 10:35:43 +00:00
|
|
|
from voussoirkit import pipeable
|
|
|
|
from voussoirkit import vlogging
|
|
|
|
|
2021-01-29 00:51:34 +00:00
|
|
|
# Module-level logger. The second argument presumably names the log topic /
# CLI prefix used by voussoirkit.vlogging — confirm against that package.
log = vlogging.getLogger(__name__, 'hash_hardlink')
|
2019-12-10 20:57:23 +00:00
|
|
|
|
|
|
|
def hash_file(file):
    '''
    Return the hex MD5 digest of the file's contents.

    file:
        A path object exposing .open('rb') (e.g. pathclass.Path or
        pathlib.Path). The file is read in 1 MiB chunks so arbitrarily
        large files hash in constant memory.
    '''
    digest = hashlib.md5()
    with file.open('rb') as handle:
        # iter() with a sentinel calls read() until it returns empty bytes
        # at EOF, which is equivalent to the manual read/break loop.
        for chunk in iter(lambda: handle.read(2**20), b''):
            digest.update(chunk)
    return digest.hexdigest()
|
|
|
|
|
2021-01-23 04:58:55 +00:00
|
|
|
@pipeable.ctrlc_return1
def hash_hardlink_argparse(args):
    '''
    Hash every input file and replace duplicate copies with hardlinks to a
    single "leader" copy, reclaiming disk space.

    args.patterns:
        Glob patterns (or pipeable input) selecting files and/or directories.
        Directories are walked recursively.
    args.if_larger_than:
        Minimum file size in bytes; smaller files are skipped.

    Raises ValueError if no paths match the patterns, or if the matched
    paths span multiple drives (hardlinks cannot cross filesystems).

    Returns 0 on success.
    '''
    patterns = pipeable.input_many(args.patterns, strip=True, skip_blank=True)
    paths = list(pathclass.glob_many(patterns))

    # Without this check, zero matches would fall through to the drive check
    # below (empty set of drives) and raise a misleading error message.
    if not paths:
        raise ValueError('The patterns did not match any files.')

    drives = set(path.stat.st_dev for path in paths)
    if len(drives) != 1:
        raise ValueError('All paths must be on the same drive.')

    files = lazychain.LazyChain()
    for path in paths:
        if path.is_file:
            files.append(path)
        else:
            files.extend(path.walk_files())

    files = (file for file in files if file.size >= args.if_larger_than)

    # Inodes we have already hashed; files sharing an inode are already
    # hardlinks of each other and need no work.
    inodes = set()
    # Maps hex digest -> list of distinct (by inode) files with that hash.
    hashes = {}

    for file in files:
        if file.stat.st_ino in inodes:
            # This file is already a hardlink of another file we've seen.
            continue
        inodes.add(file.stat.st_ino)
        h = hash_file(file)
        print(file.absolute_path, h)
        hashes.setdefault(h, []).append(file)

    # Keep only the digests that actually have duplicates.
    hashes = {h: files for (h, files) in hashes.items() if len(files) > 1}

    for (h, files) in hashes.items():
        leader = files.pop(0)
        for follower in files:
            print(f'{leader.absolute_path} -> {follower.absolute_path}')
            # NOTE(review): the follower is trashed before the link is made.
            # If os.link fails, the data still exists in the leader and the
            # trashed copy is recoverable, but the follower path is gone.
            send2trash.send2trash(follower.absolute_path)
            os.link(leader.absolute_path, follower.absolute_path)

    return 0
|
|
|
|
|
2021-06-22 05:11:19 +00:00
|
|
|
@vlogging.main_decorator
def main(argv):
    '''
    Build the argument parser, parse argv, and dispatch to the handler
    registered via set_defaults. Returns the handler's exit status.
    '''
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('patterns', nargs='+')
    parser.add_argument('--if_larger_than', '--if-larger-than', type=bytestring.parsebytes, default=-1)
    parser.set_defaults(func=hash_hardlink_argparse)

    parsed = parser.parse_args(argv)
    return parsed.func(parsed)
|
|
|
|
|
2019-12-10 20:57:23 +00:00
|
|
|
if __name__ == '__main__':
    # sys.exit raises SystemExit with main()'s integer return value,
    # making it the process exit status.
    sys.exit(main(sys.argv[1:]))
|