cmd/hash_hardlink.py

81 lines
2.3 KiB
Python
Raw Normal View History

2021-01-14 10:35:43 +00:00
import argparse
2019-12-10 20:57:23 +00:00
import hashlib
import os
import send2trash
import sys
from voussoirkit import bytestring
2021-01-14 10:35:43 +00:00
from voussoirkit import lazychain
from voussoirkit import pathclass
2021-01-14 10:35:43 +00:00
from voussoirkit import pipeable
2019-12-10 20:57:23 +00:00
from voussoirkit import spinal
2021-01-14 10:35:43 +00:00
from voussoirkit import vlogging
# Module-level logger created through voussoirkit's vlogging helper.
# NOTE(review): the second argument looks like a display name for when the
# file runs as a script -- confirm against vlogging.getLogger.
log = vlogging.getLogger(__name__, 'hash_hardlink')
2019-12-10 20:57:23 +00:00
def hash_file(file):
    '''
    Return the hexadecimal MD5 digest of a file's contents.

    file:
        An object whose open('rb') method returns a binary handle usable as
        a context manager.

    The file is read in 1 MiB pieces so the whole file is never held in
    memory at once.
    '''
    chunk_size = 2 ** 20
    md5 = hashlib.md5()
    with file.open('rb') as handle:
        chunk = handle.read(chunk_size)
        while chunk:
            md5.update(chunk)
            chunk = handle.read(chunk_size)
    return md5.hexdigest()
2021-01-23 04:58:55 +00:00
@pipeable.ctrlc_return1
def hash_hardlink_argparse(args):
    '''
    Hash the given files and replace duplicates with hardlinks.

    args:
        The argparse namespace. Reads args.paths (files and/or directories;
        directories are expanded with spinal.walk) and args.if_larger_than
        (optional size string handed to bytestring.parsebytes; files smaller
        than the parsed size are ignored).

    For every group of files with the same MD5 digest, the first file seen is
    kept as the leader. Each other file is sent to the trash and a hardlink
    to the leader is created at its path.

    Raises ValueError if the given paths are not all on a single device.
    '''
    paths = [
        pathclass.Path(p)
        for p in pipeable.input_many(args.paths, strip=True, skip_blank=True)
    ]
    drives = set(path.stat.st_dev for path in paths)
    if len(drives) != 1:
        raise ValueError('All paths must be on the same drive.')

    files = lazychain.LazyChain()
    for path in paths:
        if path.is_file:
            files.append(path)
        elif path.is_dir:
            files.extend(spinal.walk(path))

    if args.if_larger_than:
        larger = bytestring.parsebytes(args.if_larger_than)
    else:
        larger = None

    # The check above only covers the paths that were passed in. A walked
    # directory can still contain files that live on another device (for
    # example a nested mount point). Inode numbers are only unique within one
    # device and hardlinks cannot span devices, so files are identified by
    # (st_dev, st_ino) and duplicates are only grouped within the same device.
    # Otherwise a follower could be trashed and then os.link would fail,
    # leaving nothing at the follower's path.
    seen_inodes = set()
    duplicates = {}

    for file in files:
        stat = file.stat
        identity = (stat.st_dev, stat.st_ino)
        if identity in seen_inodes:
            # This file is already a hardlink of another file we've seen.
            continue
        if larger is not None and file.size < larger:
            continue
        seen_inodes.add(identity)
        h = hash_file(file)
        print(file.absolute_path, h)
        duplicates.setdefault((stat.st_dev, h), []).append(file)

    for group in duplicates.values():
        if len(group) < 2:
            continue
        (leader, *followers) = group
        for follower in followers:
            print(f'{leader.absolute_path} -> {follower.absolute_path}')
            # The follower goes to the trash rather than being deleted, so it
            # stays recoverable; the link then takes over its path.
            send2trash.send2trash(follower.absolute_path)
            os.link(leader.absolute_path, follower.absolute_path)
@vlogging.main_decorator
def main(argv):
    '''
    Parse the command line arguments in argv and run the hardlink routine.

    argv:
        List of argument strings, without the program name.

    Returns whatever the function selected through set_defaults(func=...)
    returns.
    '''
    cli = argparse.ArgumentParser(description=__doc__)

    cli.add_argument('paths', nargs='+')
    cli.add_argument('--if_larger_than', '--if-larger-than', default=None)
    cli.set_defaults(func=hash_hardlink_argparse)

    namespace = cli.parse_args(argv)
    handler = namespace.func
    return handler(namespace)
2019-12-10 20:57:23 +00:00
# Script entry point: main's return value becomes the process exit status.
if __name__ == '__main__':
    exit_status = main(sys.argv[1:])
    raise SystemExit(exit_status)