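"""Find and quarantine exact duplicate files in the main reference image folder.

Each file in TARGET_DIR is hashed by content (MD5); any file whose hash matches
a previously seen file is moved into TRASH_DIR rather than deleted.
"""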
import hashlib
import os
import shutil

# Directory to deduplicate, and a holding area for anything moved out of it.
TARGET_DIR = os.path.abspath("assets/slike/glavna_referenca")
TRASH_DIR = os.path.abspath("_TRASH_BIN/duplicates_referenca")


def get_file_hash(filepath):
    """Calculate the MD5 hash of a file's content, reading in 64 KiB chunks."""
    hasher = hashlib.md5()
    try:
        with open(filepath, "rb") as f:
            # Stream the file in chunks so large files are never fully in memory.
            while chunk := f.read(65536):
                hasher.update(chunk)
        return hasher.hexdigest()
    except OSError:
        # Unreadable file (permissions, broken symlink, ...) -- skip it.
        return None


def remove_duplicates():
    if not os.path.exists(TARGET_DIR):
        print("❌ Target directory not found.")
        return

    os.makedirs(TRASH_DIR, exist_ok=True)

    print(f"🔍 Scanning for exact duplicates in {TARGET_DIR}...")

    unique_hashes = {}  # hash -> filepath of the first file seen with that hash
    duplicates = 0
    scanned = 0

    # Consider only regular, non-hidden files directly inside TARGET_DIR.
    files = [
        f
        for f in os.listdir(TARGET_DIR)
        if os.path.isfile(os.path.join(TARGET_DIR, f)) and not f.startswith(".")
    ]
    total_files = len(files)

    for filename in files:
        filepath = os.path.join(TARGET_DIR, filename)
        file_hash = get_file_hash(filepath)

        if file_hash:
            if file_hash in unique_hashes:
                # Duplicate found: move it aside rather than deleting. (Assumes
                # duplicate file names don't collide inside TRASH_DIR.)
                original = unique_hashes[file_hash]
                shutil.move(filepath, os.path.join(TRASH_DIR, filename))
                duplicates += 1

                # Optional: print which original each duplicate matched.
                # print(f"Duplicate: {filename} == {os.path.basename(original)}")
            else:
                # First file seen with this hash -- record it as the original.
                unique_hashes[file_hash] = filepath

        scanned += 1
        if scanned % 500 == 0:
            print(f"   Scanned {scanned}/{total_files} files...")

    print(f"✨ DONE! Found and moved {duplicates} duplicates to {TRASH_DIR}")
    print(f"✅ Unique files remaining: {len(unique_hashes)}")


if __name__ == "__main__":
    remove_duplicates()
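# Note: TARGET_DIR and TRASH_DIR are resolved relative to the current working
# directory, so run the script from the project root. Nothing is deleted
# outright; a mistaken run can be undone by moving files back out of TRASH_DIR.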