novafarma/scripts/smart_green_cleanup.py

#!/usr/bin/env python3
"""
SMART VISUAL CLEANUP
Detects duplicates based on VISUAL CONTENT (pixels), not filenames.
Keeps Green Screen versions; moves the normal version into a review folder when its subject matches.
"""

import os
import shutil
import numpy as np
from pathlib import Path
from PIL import Image

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Root folder that is searched recursively for images.
SOURCE_DIR = Path("/Users/davidkotnik/repos/novafarma/assets/slike/glavna_referenca")
# Review folder (inside SOURCE_DIR) that receives files flagged as duplicates.
TRASH_DIR = SOURCE_DIR.joinpath("_ZA_BRISANJE_DUPLIKATI")

# Green screen thresholds as (R, G, B) lower and upper bounds.
# Standard chroma green is approximately (0, 177, 64) or (0, 255, 0).
# NOTE(review): the functions visible in this file do not read these bounds;
# is_green_pixel uses a "green channel dominates" test instead.
GREEN_MIN = np.array((0, 80, 0))
GREEN_MAX = np.array((120, 255, 120))

def is_green_pixel(arr):
    """Return a boolean mask marking the green pixels of an image array.

    ``arr`` is indexed as ``arr[row, col, channel]`` with channels 0, 1, 2
    read as red, green, blue. A pixel counts as green when its green value
    is strictly larger than both red and blue and also greater than 100.
    """
    red = arr[:, :, 0]
    green = arr[:, :, 1]
    blue = arr[:, :, 2]

    dominates_red = green > red
    dominates_blue = green > blue
    bright_enough = green > 100
    return dominates_red & dominates_blue & bright_enough

def is_green_screen_image(img_arr):
    """
    Guess whether an image array has a green screen background.

    Only the four 5x5 corner patches are inspected. A corner votes "green"
    when more than 80% of its pixels pass ``is_green_pixel``; the image is
    accepted when at least three corners vote green. Images smaller than
    10 pixels in either dimension are always rejected.
    """
    height, width, _channels = img_arr.shape
    if height < 10 or width < 10:
        return False

    patch = 5
    row_spans = (slice(0, patch), slice(height - patch, height))   # top, bottom
    col_spans = (slice(0, patch), slice(width - patch, width))     # left, right

    # Visit the corners in the order top-left, top-right, bottom-left,
    # bottom-right and count how many are mostly green.
    green_corners = sum(
        1
        for rows in row_spans
        for cols in col_spans
        if np.mean(is_green_pixel(img_arr[rows, cols])) > 0.8
    )
    return green_corners >= 3

def are_visually_identical_subject(normal_path, green_path, tolerance=15.0):
    """
    Decide whether a normal image shows the same subject as a green screen image.

    Both files must have identical pixel dimensions. Pixels that
    ``is_green_pixel`` classifies as green in the green image are treated as
    background and ignored. On the remaining "subject" pixels the per-pixel
    sum of the absolute R, G and B differences is averaged.

    Args:
        normal_path: Path of the candidate duplicate (the normal image).
        green_path: Path of the green screen image to compare against.
        tolerance: Exclusive upper bound for the mean summed-RGB difference
            of the subject pixels (each pixel contributes 0 to 765). The
            default of 15.0 leaves room for JPG compression artifacts.

    Returns:
        True when the mean subject difference is below ``tolerance``.
        False when the sizes differ, when the green image has no non-green
        pixels, or when either file cannot be read or compared (a warning
        is printed in that last case).
    """
    try:
        # Context managers close both files even if decoding fails.
        with Image.open(normal_path) as raw_n, Image.open(green_path) as raw_g:
            # Compare dimensions before converting, so mismatched pairs are
            # rejected without decoding their pixel data to RGB.
            if raw_n.size != raw_g.size:
                return False  # Different dimensions = not the same generation

            arr_n = np.array(raw_n.convert('RGB'))
            arr_g = np.array(raw_g.convert('RGB'))

        # Background mask taken from the green image.
        green_mask = is_green_pixel(arr_g)

        # Widen to a signed type first: subtracting uint8 arrays would wrap.
        diff = np.abs(arr_n.astype(np.int16) - arr_g.astype(np.int16))
        diff_sum = np.sum(diff, axis=2)  # Sum of the R, G and B differences

        # Only pixels that are NOT green in the green image belong to the subject.
        subject_diff = diff_sum[~green_mask]
        if subject_diff.size == 0:
            return False  # Green image is entirely background

        return bool(np.mean(subject_diff) < tolerance)

    except Exception as e:
        # Best effort: an unreadable or undecodable pair is "not a match",
        # but the reason is reported instead of being silently discarded.
        print(f"Warning: could not compare {normal_path} with {green_path}: {e}")
        return False

def smart_cleanup():
    """
    Move normal images that duplicate a green screen image into TRASH_DIR.

    Steps:
      1. Collect every ``*.png`` and ``*.jpg`` file under SOURCE_DIR
         (recursively), ignoring paths that contain ``_ZA_BRISANJE``.
      2. Classify each image as green screen or normal with
         ``is_green_screen_image``; unreadable files are reported and skipped.
      3. For every normal image, compare it against the green screen images
         of the same pixel size with ``are_visually_identical_subject``. On
         the first match the normal file is moved into TRASH_DIR under a
         name that does not already exist there.

    Nothing is deleted; files are only moved so they can be reviewed.
    Progress and results are printed to stdout. Returns None.
    """
    print("🧠 STARTING VISUAL ANALYSIS (PIXEL MATCHING)...")
    print(f"📂 Searching in: {SOURCE_DIR}")

    # Without the source folder there is nothing to scan, and mkdir below
    # would fail with a traceback.
    if not SOURCE_DIR.is_dir():
        print(f"❌ Source directory not found: {SOURCE_DIR}")
        return

    TRASH_DIR.mkdir(exist_ok=True)

    all_files = list(SOURCE_DIR.rglob("*.png")) + list(SOURCE_DIR.rglob("*.jpg"))

    print(f"🔍 Analyzing {len(all_files)} images...")

    green_images = []   # (path, size) of green screen images
    normal_images = []  # (path, size) of everything else

    # 1. Classify Images
    print("🎨 Classifying Green Screen vs Normal...")
    for f in all_files:
        # Never re-process files that already sit in the review folder.
        if "_ZA_BRISANJE" in str(f):
            continue

        try:
            # The context manager closes the file handle for every image.
            with Image.open(f) as raw:
                size = raw.size
                arr = np.array(raw.convert('RGB'))
        except Exception as e:
            # Best effort: skip unreadable files, but say so. Unlike a bare
            # `except:`, this lets Ctrl+C (KeyboardInterrupt) stop the script.
            print(f"⚠️  Skipping unreadable image {f}: {e}")
            continue

        if is_green_screen_image(arr):
            green_images.append((f, size))
        else:
            normal_images.append((f, size))

    print(f"✅ Found {len(green_images)} GREEN SCREEN images")
    print(f"✅ Found {len(normal_images)} NORMAL images")
    print("🔄 Comparing subjects to find duplicates...")

    # Group green images by dimensions so each normal image is only compared
    # against candidates of the same size instead of all N*M pairs.
    green_by_size = {}
    for green_path, size in green_images:
        green_by_size.setdefault(size, []).append(green_path)

    duplicates_found = 0

    # 2. Compare
    for i, (norm_path, size) in enumerate(normal_images):
        if i % 100 == 0:
            print(f"  Processed {i}/{len(normal_images)} normal images...")

        for green_path in green_by_size.get(size, ()):
            if not are_visually_identical_subject(norm_path, green_path):
                continue

            # Match found: pick a destination name that is not taken yet, so
            # a file left in TRASH_DIR by an earlier run is never overwritten.
            serial = duplicates_found
            dest = TRASH_DIR / f"{serial}_{norm_path.name}"
            while dest.exists():
                serial += 1
                dest = TRASH_DIR / f"{serial}_{norm_path.name}"

            try:
                shutil.move(str(norm_path), str(dest))
            except OSError as e:
                print(f"Error moving: {e}")
            else:
                duplicates_found += 1
            # Stop after the first match whether or not the move succeeded;
            # retrying the same move against other candidates cannot help.
            break

    print("-" * 50)
    print(f"🎉 DONE! Found and moved {duplicates_found} duplicates.")
    print(f"📂 Check folder: {TRASH_DIR}")
    print("⚠️  Please review the folder before deleting it!")

# Run the cleanup only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    smart_cleanup()