# Source: novafarma/scripts/fuzzy_dedupe_npc.py (Python; listed as 74 lines, 2.7 KiB)

import os
import difflib
# Folder whose files are scanned for near-duplicate names. The literal is a
# relative path, so os.path.abspath resolves it against the current working
# directory at import time.
_NPC_DIR_RELATIVE = "assets/slike/glavna_referenca/_FOLDER_NPC"
NPC_DIR = os.path.abspath(_NPC_DIR_RELATIVE)
# Extensions treated as interchangeable encodings of the same picture.
_IMAGE_EXTENSIONS = frozenset({".png", ".jpg", ".jpeg", ".webp", ".gif", ".bmp"})


def _extensions_compatible(ext1, ext2):
    """Return True when two extensions are equal or both are image types."""
    ext1, ext2 = ext1.lower(), ext2.lower()
    return ext1 == ext2 or (ext1 in _IMAGE_EXTENSIONS and ext2 in _IMAGE_EXTENSIONS)


def _stems_look_duplicated(short_stem, long_stem, min_ratio):
    """Return True when long_stem looks like a renamed copy of short_stem."""
    if short_stem not in long_stem:
        return False
    if difflib.SequenceMatcher(None, short_stem, long_stem).ratio() > min_ratio:
        return True
    if not long_stem.startswith(short_stem):
        return False
    # A bare prefix match is only trusted when the leftover suffix starts at a
    # name boundary ("gronk" -> "gronk_01", "gronk (2)", "gronk001"); otherwise
    # unrelated words such as "man" / "manager" would be treated as duplicates.
    suffix = long_stem[len(short_stem):]
    return not suffix or not suffix[0].isalpha()


def find_fuzzy_duplicates(npc_dir=None, dry_run=False, min_ratio=0.8):
    """Delete files whose names look like copies of another file in a folder.

    Two files are considered duplicates when their extensions are compatible
    (identical, or both image formats) and the shorter stem is contained in
    the longer one with either a SequenceMatcher ratio above ``min_ratio`` or
    a prefix match that ends at a name boundary. Of each pair the smaller
    file is removed; on equal size the one with the longer stem is removed.

    Args:
        npc_dir: Directory to scan. Defaults to the module-level ``NPC_DIR``.
        dry_run: When True, report what would be deleted without deleting.
        min_ratio: Similarity threshold for the non-prefix containment rule.

    Returns:
        Sorted list of paths that were removed (or, with ``dry_run``, that
        would be removed). Paths whose removal failed are not included.

    Raises:
        OSError: If the directory cannot be listed or a file size cannot be
            read. Failures while deleting are printed, not raised.
    """
    if npc_dir is None:
        npc_dir = NPC_DIR
    npc_dir = os.path.abspath(npc_dir)
    print(f"🕵️‍♂️ Analyzing similar files in {os.path.basename(npc_dir)}...")

    # Collect (name, stem, ext, path) for visible regular files only;
    # sub-directories cannot be removed with os.remove and are not duplicates.
    candidates = []
    for name in os.listdir(npc_dir):
        if name.startswith("."):
            continue
        path = os.path.join(npc_dir, name)
        if not os.path.isfile(path):
            continue
        stem, ext = os.path.splitext(name)
        candidates.append((name, stem, ext, path))

    # Shorter stems first so each stem is only tested for containment in
    # longer (or equal) ones; the name tie-break keeps runs deterministic.
    candidates.sort(key=lambda entry: (len(entry[1]), entry[0]))

    files_to_remove = set()
    for index, (name1, stem1, ext1, path1) in enumerate(candidates):
        if path1 in files_to_remove:
            continue
        for name2, stem2, ext2, path2 in candidates[index + 1:]:
            if path2 in files_to_remove:
                continue
            if not _extensions_compatible(ext1, ext2):
                continue
            if not _stems_look_duplicated(stem1, stem2, min_ratio):
                continue

            # Keep the larger file, assuming it is the better-quality copy.
            size1 = os.path.getsize(path1)
            size2 = os.path.getsize(path2)
            print(f" ⚠️ Potential dupe: '{name1}' ({size1}b) vs '{name2}' ({size2}b)")
            if size1 >= size2:
                print(f" 🗑️ Deleting smaller/same: {name2}")
                files_to_remove.add(path2)
            else:
                print(f" 🗑️ Deleting smaller: {name1}")
                files_to_remove.add(path1)
                break  # name1 is gone, stop checking it

    planned = sorted(files_to_remove)
    if dry_run:
        print(f"\n🗑️ Dry run: would delete {len(planned)} files.")
        return planned

    print(f"\n🗑️ Deleting {len(planned)} files...")
    removed = []
    for path in planned:
        try:
            os.remove(path)
        except OSError as e:
            print(f"Error removing {path}: {e}")
        else:
            removed.append(path)
    print("✨ Fuzzy cleanup done.")
    return removed
# Script entry point: run the cleanup only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    find_fuzzy_duplicates()