#!/usr/bin/env python3
"""
SMART VISUAL CLEANUP
Detects duplicates based on VISUAL CONTENT (pixels), not filenames.
Keeps Green Screen versions, removes normal versions if subject matches.

"Removing" means moving the normal image into a trash folder inside the
source directory; nothing is deleted by this script.
"""

import os
import shutil
from pathlib import Path

import numpy as np
from PIL import Image

# Configuration
SOURCE_DIR = Path("/Users/davidkotnik/repos/novafarma/assets/slike/glavna_referenca")
TRASH_DIR = SOURCE_DIR / "_ZA_BRISANJE_DUPLIKATI"

# Any path containing this marker is treated as already-trashed and skipped.
TRASH_MARKER = "_ZA_BRISANJE"

# Green Screen Thresholds (RGB)
# Standard chroma green is approx (0, 177, 64) or (0, 255, 0).
# NOTE: these bounds are not used by the detection below, which relies on a
# "green channel is dominant and bright" test instead. They are kept so that
# anything importing them from this module keeps working.
GREEN_MIN = np.array([0, 80, 0])
GREEN_MAX = np.array([120, 255, 120])


def is_green_pixel(arr):
    """Return a boolean mask of the pixels that count as "green screen".

    A pixel is green when its G channel is strictly greater than both its
    R and B channels and is brighter than 100.

    Args:
        arr: array indexed as ``[row, col, channel]`` with at least the
            three channels R, G, B (in that order).

    Returns:
        Boolean array of shape ``arr.shape[:2]``.
    """
    red, green, blue = arr[:, :, 0], arr[:, :, 1], arr[:, :, 2]
    return (green > red) & (green > blue) & (green > 100)


def is_green_screen_image(img_arr):
    """Check whether an image is likely a green screen image.

    Strategy: look at a 5x5 patch in each of the four corners. A corner
    votes "green" when more than 80% of its pixels are green; the image is
    a green screen image when at least 3 corners vote green.

    Args:
        img_arr: image pixels indexed as ``[row, col, channel]``.

    Returns:
        True if the image looks like a green screen image. Arrays that are
        not 3-dimensional, have fewer than 3 channels, or are smaller than
        10 pixels on either side are never classified as green screen.
    """
    if img_arr.ndim != 3 or img_arr.shape[2] < 3:
        return False

    h, w = img_arr.shape[:2]
    if h < 10 or w < 10:
        return False

    corners = (
        img_arr[0:5, 0:5],          # Top-Left
        img_arr[0:5, w - 5:w],      # Top-Right
        img_arr[h - 5:h, 0:5],      # Bottom-Left
        img_arr[h - 5:h, w - 5:w],  # Bottom-Right
    )
    green_votes = sum(
        1 for patch in corners if np.mean(is_green_pixel(patch)) > 0.8
    )
    return green_votes >= 3


def are_visually_identical_subject(normal_path, green_path, tolerance=15.0):
    """Tell whether the subject of a green screen image matches a normal image.

    Both images must have exactly the same dimensions. The green background
    of the green image is masked out and only the remaining (subject)
    pixels are compared against the same positions in the normal image.

    Args:
        normal_path: path of the normal (non green screen) image.
        green_path: path of the green screen image.
        tolerance: upper bound (exclusive) for the mean per-pixel difference
            of the subject pixels. The per-pixel difference is the SUM of
            the absolute R, G and B differences, so it ranges from 0 to 765.
            The default leaves room for JPG compression artifacts.

    Returns:
        True when the subjects match. False when the dimensions differ, the
        green image has no non-green pixels at all, the difference is too
        large, or either image cannot be read (a warning is printed in that
        last case).
    """
    try:
        # Image.open only reads the header, so the size check below is cheap
        # and lets us skip pixel decoding for mismatched pairs. The context
        # manager guarantees both file handles are released.
        with Image.open(normal_path) as img_n, Image.open(green_path) as img_g:
            if img_n.size != img_g.size:
                return False  # Different dimensions = not the same generation

            # int16 holds the full -255..255 range of channel differences.
            arr_n = np.array(img_n.convert('RGB')).astype(np.int16)
            arr_g = np.array(img_g.convert('RGB')).astype(np.int16)

        # Background of the green image; everything else is the subject.
        green_mask = is_green_pixel(arr_g)

        # Per-pixel difference, summed over the three colour channels.
        diff_sum = np.abs(arr_n - arr_g).sum(axis=2)

        # We only care about differences where the green image is NOT green.
        subject_diff = diff_sum[~green_mask]
        if subject_diff.size == 0:
            return False  # Image is 100% green, there is no subject to match

        return bool(np.mean(subject_diff) < tolerance)
    except Exception as exc:
        # Best effort: an unreadable or corrupt file must not abort the whole
        # run, but the failure should be visible instead of silently ignored.
        print(f"Error comparing {normal_path} with {green_path}: {exc}")
        return False


def _classify_images(paths):
    """Split image paths into (green screen, normal) lists of path/size records."""
    green_images = []
    normal_images = []
    for path in paths:
        try:
            with Image.open(path) as img:
                rgb = img.convert('RGB')
            record = {'path': path, 'size': rgb.size}
            if is_green_screen_image(np.array(rgb)):
                green_images.append(record)
            else:
                normal_images.append(record)
        except Exception as exc:
            # Unreadable files are skipped (as before), but reported, and
            # Ctrl-C is no longer swallowed the way a bare ``except`` did.
            print(f"Skipping unreadable image {path}: {exc}")
    return green_images, normal_images


def _group_paths_by_size(images):
    """Map each image size to the list of paths having that size."""
    by_size = {}
    for image in images:
        by_size.setdefault(image['size'], []).append(image['path'])
    return by_size


def _unique_destination(trash_dir, index, name):
    """Return a path inside trash_dir that does not exist yet.

    A previous run may already have produced ``{index}_{name}``; moving onto
    an existing file can silently overwrite it, so an extra counter is added
    until the name is free.
    """
    dest = trash_dir / f"{index}_{name}"
    bump = 0
    while dest.exists():
        bump += 1
        dest = trash_dir / f"{index}_{bump}_{name}"
    return dest


def smart_cleanup(source_dir=SOURCE_DIR, trash_dir=None):
    """Move normal images that duplicate a green screen image into the trash.

    Every ``*.png`` / ``*.jpg`` below ``source_dir`` (recursively) is
    classified as green screen or normal. Each normal image is then compared
    with the green screen images of identical dimensions; on the first
    subject match the normal image is moved into ``trash_dir``.

    Args:
        source_dir: directory to scan. Defaults to ``SOURCE_DIR``.
        trash_dir: directory receiving the duplicates. Defaults to a folder
            named like ``TRASH_DIR`` inside ``source_dir``.

    Returns:
        Number of images that were moved to the trash directory.
    """
    source_dir = Path(source_dir)
    if trash_dir is None:
        trash_dir = source_dir / TRASH_DIR.name
    else:
        trash_dir = Path(trash_dir)

    print("🧠 STARTING VISUAL ANALYSIS (PIXEL MATCHING)...")
    print(f"📂 Searching in: {source_dir}")

    if not source_dir.is_dir():
        print(f"Source directory does not exist: {source_dir}")
        return 0

    trash_dir.mkdir(parents=True, exist_ok=True)

    all_files = list(source_dir.rglob("*.png")) + list(source_dir.rglob("*.jpg"))
    print(f"🔍 Analyzing {len(all_files)} images...")

    # 1. Classify images, ignoring anything that already sits in a trash folder.
    print("🎨 Classifying Green Screen vs Normal...")
    candidates = [
        f for f in all_files
        if TRASH_MARKER not in str(f) and trash_dir not in f.parents
    ]
    green_images, normal_images = _classify_images(candidates)

    print(f"✅ Found {len(green_images)} GREEN SCREEN images")
    print(f"✅ Found {len(normal_images)} NORMAL images")
    print("🔄 Comparing subjects to find duplicates...")

    # Optimization: group by dimensions to avoid N*M comparisons.
    green_by_size = _group_paths_by_size(green_images)

    # 2. Compare
    duplicates_found = 0
    for i, norm in enumerate(normal_images):
        if i % 100 == 0:
            print(f" Processed {i}/{len(normal_images)} normal images...")

        norm_path = norm['path']
        same_size_greens = green_by_size.get(norm['size'], ())

        # any() stops at the first matching green image.
        is_duplicate = any(
            are_visually_identical_subject(norm_path, green_path)
            for green_path in same_size_greens
        )
        if not is_duplicate:
            continue

        # MATCH FOUND: move the normal (duplicate) image to the trash. The
        # move is attempted once; a failure is reported and we go on with
        # the next normal image.
        dest = _unique_destination(trash_dir, duplicates_found, norm_path.name)
        try:
            shutil.move(str(norm_path), str(dest))
        except OSError as exc:  # shutil.Error is a subclass of OSError
            print(f"Error moving: {exc}")
        else:
            duplicates_found += 1

    print("-" * 50)
    print(f"🎉 DONE! Found and moved {duplicates_found} duplicates.")
    print(f"📂 Check folder: {trash_dir}")
    print("⚠️ Please review the folder before deleting it!")
    return duplicates_found


if __name__ == "__main__":
    smart_cleanup()