import os
import difflib

# Target directory
NPC_DIR = os.path.abspath("assets/slike/glavna_referenca/_FOLDER_NPC")

# Extensions treated as "compatible image types" for RULE 1 below
# (assumption: the folder holds common raster formats).
IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".webp"}


def find_fuzzy_duplicates():
    print(f"šŸ•µļøā€ā™‚ļø Analyzing similar files in {os.path.basename(NPC_DIR)}...")

    files = {}  # filename -> filepath
    for f in os.listdir(NPC_DIR):
        path = os.path.join(NPC_DIR, f)
        # Skip hidden entries and subdirectories
        if f.startswith(".") or not os.path.isfile(path):
            continue
        files[f] = path

    filenames = list(files.keys())
    files_to_remove = set()

    # Simple check: name containment (e.g. "gronk.png" vs "gronk_01.png"),
    # sorted by length so we check shorter names against longer ones.
    sorted_names = sorted(filenames, key=len)

    for i in range(len(sorted_names)):
        name1 = sorted_names[i]
        stem1, ext1 = os.path.splitext(name1)
        path1 = files[name1]

        # Skip if already marked for deletion
        if path1 in files_to_remove:
            continue

        for j in range(i + 1, len(sorted_names)):
            name2 = sorted_names[j]
            stem2, ext2 = os.path.splitext(name2)
            path2 = files[name2]

            if path2 in files_to_remove:
                continue

            # RULE 1: stem1 is contained in stem2 (e.g. "image" in "image_copy")
            # AND extensions match or are compatible image types.
            ext_compatible = (ext1.lower() == ext2.lower()
                              or (ext1.lower() in IMAGE_EXTS and ext2.lower() in IMAGE_EXTS))
            if stem1 in stem2 and ext_compatible:
                # Check similarity ratio to avoid false positives like "man" in "woman"
                ratio = difflib.SequenceMatcher(None, stem1, stem2).ratio()
                if ratio > 0.8 or stem2.startswith(stem1):
                    # Potential duplicate. Strategy: keep the LARGER file
                    # (usually better quality) and remove the smaller one.
                    size1 = os.path.getsize(path1)
                    size2 = os.path.getsize(path2)
                    print(f"  āš ļø Potential dupe: '{name1}' ({size1}b) vs '{name2}' ({size2}b)")

                    if size1 >= size2:
                        print(f"     šŸ—‘ļø Deleting smaller/same: {name2}")
                        files_to_remove.add(path2)
                    else:
                        print(f"     šŸ—‘ļø Deleting smaller: {name1}")
                        files_to_remove.add(path1)
                        break  # name1 is gone, stop checking it

    # Do the deletion
    print(f"\nšŸ—‘ļø Deleting {len(files_to_remove)} files...")
    for p in files_to_remove:
        try:
            os.remove(p)
        except OSError as e:
            print(f"Error removing {p}: {e}")

    print("✨ Fuzzy cleanup done.")


if __name__ == "__main__":
    find_fuzzy_duplicates()
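

# --- Sketch: previewing the match rule without touching the filesystem ---
# A minimal, self-contained helper for sanity-checking RULE 1 above before
# running the destructive cleanup; the function name `preview_match` and the
# sample filenames are hypothetical, added purely for illustration.
def preview_match(name1: str, name2: str) -> bool:
    """Return True if the two names would be flagged as potential duplicates."""
    stem1, ext1 = os.path.splitext(name1)
    stem2, ext2 = os.path.splitext(name2)
    ext_compatible = (ext1.lower() == ext2.lower()
                      or (ext1.lower() in IMAGE_EXTS and ext2.lower() in IMAGE_EXTS))
    if not (stem1 in stem2 and ext_compatible):
        return False
    ratio = difflib.SequenceMatcher(None, stem1, stem2).ratio()
    return ratio > 0.8 or stem2.startswith(stem1)

# Expected behavior:
#   preview_match("gronk.png", "gronk_01.png")  -> True  (stem is a prefix)
#   preview_match("man.png", "woman.png")       -> False (ratio 0.75, not a prefix)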