Files
novafarma/scripts/smart_green_cleanup.py
David Kotnik 7ab6025486 🎨 Generated 10 Steampunk/Chibi Animals & Final Reference Org
EXTENDED SESSION (03:00 - 03:45 CET):

1. ANIMAL GENERATION (assets/slike/animals/generated_steampunk/):
    10 unique assets created:
   - Farm: Cow, Pig, Chicken, Duck, Goat, Horse, Rabbit, Donkey, Llama
   - Forest: Fox, Bear, Wolf
   - Style: Dark Noir Steampunk Chibi

2. REFERENCE ORGANIZATION (assets/slike/glavna_referenca/):
    Organized 2,626 files into subfolders
    Created comprehensive biome structure (200 folders)
    Moved docs to docs/art_guidelines/

SESSION UPDATE:
- Total Time: 3h 03min
- Files Processed: 5,788+
- Status: SESSION COMPLETE! 🚀
2026-01-20 10:45:44 +01:00

176 lines
6.2 KiB
Python

#!/usr/bin/env python3
"""
SMART VISUAL CLEANUP
Detects duplicates based on VISUAL CONTENT (pixels), not filenames.
Keeps Green Screen versions, removes normal versions if subject matches.
"""
import os
import shutil
import numpy as np
from pathlib import Path
from PIL import Image
# --- Configuration ---------------------------------------------------------
# Root folder whose images are analysed.
SOURCE_DIR = Path("/Users/davidkotnik/repos/novafarma/assets/slike/glavna_referenca")
# Quarantine folder, located directly inside SOURCE_DIR.
TRASH_DIR = SOURCE_DIR.joinpath("_ZA_BRISANJE_DUPLIKATI")

# --- Green screen thresholds (RGB) -----------------------------------------
# Standard chroma green is roughly (0, 177, 64) or (0, 255, 0); these two
# arrays describe the lower and upper corner of an RGB box around it.
# NOTE(review): nothing in the visible part of this file reads GREEN_MIN or
# GREEN_MAX -- confirm whether they are still needed.
GREEN_MIN = np.array((0, 80, 0))
GREEN_MAX = np.array((120, 255, 120))
def is_green_pixel(arr):
    """Return a boolean mask of the green-dominant pixels in *arr*.

    *arr* is indexed as ``arr[row, col, channel]`` with channels 0, 1 and 2
    read as R, G and B.  A pixel is flagged when its G value is strictly
    greater than both its R and B values and also greater than 100.  The
    result holds one boolean per (row, col) position.
    """
    red = arr[:, :, 0]
    green = arr[:, :, 1]
    blue = arr[:, :, 2]

    # Green must win against both other channels ...
    dominates = (green > red) & (green > blue)
    # ... and be bright enough to rule out dark, greenish pixels.
    bright_enough = green > 100
    return dominates & bright_enough
def is_green_screen_image(img_arr):
    """Decide whether *img_arr* looks like a green screen image.

    Only the four 5x5 corner patches are inspected.  A corner counts as green
    when more than 80% of its pixels are flagged by ``is_green_pixel``; the
    image is reported as green screen when at least three corners are green.
    Images with fewer than 10 rows or 10 columns always yield ``False``.
    """
    height, width, _ = img_arr.shape
    if height < 10 or width < 10:
        return False

    patch = 5
    # Visited in the order top-left, top-right, bottom-left, bottom-right.
    row_spans = (slice(0, patch), slice(height - patch, height))
    col_spans = (slice(0, patch), slice(width - patch, width))

    green_corners = sum(
        1
        for rows in row_spans
        for cols in col_spans
        if np.mean(is_green_pixel(img_arr[rows, cols])) > 0.8
    )
    return green_corners >= 3
def are_visually_identical_subject(normal_path, green_path):
    """Return True if both images show the same subject, ignoring background.

    The "subject" is every pixel of the green image that ``is_green_pixel``
    does NOT flag.  For those positions the absolute R, G and B differences
    between the two images are summed per pixel (range 0..765) and averaged;
    the images match when that average is below 15.0.

    Args:
        normal_path: Path of the image without green screen background.
        green_path: Path of the green screen candidate.

    Returns:
        ``True`` for a match.  ``False`` when the dimensions differ, when the
        green image has no subject pixels at all, or when either file cannot
        be opened or processed.
    """
    try:
        # Image.open is lazy and keeps the file open; the context managers
        # close both handles deterministically.
        with Image.open(normal_path) as src_n, Image.open(green_path) as src_g:
            # The size is known from the header, so mismatching pairs are
            # rejected before any pixel data is decoded.
            if src_n.size != src_g.size:
                return False
            arr_n = np.array(src_n.convert('RGB'))
            arr_g = np.array(src_g.convert('RGB'))

        # Background positions, taken from the green image only.
        green_mask = is_green_pixel(arr_g)

        # int16 holds every possible channel difference (-255..255) without
        # the uint8 wrap-around, at a quarter of the memory of int64.
        diff = np.abs(arr_n.astype(np.int16) - arr_g.astype(np.int16))
        diff_sum = diff.sum(axis=2)  # per-pixel sum over R, G, B

        subject_diff = diff_sum[~green_mask]
        if subject_diff.size == 0:
            # Every pixel was classified as background: nothing to compare.
            return False

        # Tolerance for compression noise and minor regeneration differences.
        return bool(subject_diff.mean() < 15.0)
    except Exception:
        # Deliberate best effort: an unreadable or undecodable file is treated
        # as "no match" so that the caller never moves a file on bad data.
        return False
def smart_cleanup():
    """Move normal images that duplicate a green screen image into TRASH_DIR.

    Steps:
      1. Collect every ``*.png`` and ``*.jpg`` below SOURCE_DIR (recursively),
         ignoring paths that contain ``_ZA_BRISANJE``.
      2. Classify each image with ``is_green_screen_image``.
      3. For every normal image, compare it against the green screen images
         of identical dimensions using ``are_visually_identical_subject``;
         on the first match the normal image is moved to TRASH_DIR.

    Nothing is deleted: files are only moved, and progress plus a summary are
    printed.  Images that cannot be read are reported and skipped.
    """
    print("🧠 STARTING VISUAL ANALYSIS (PIXEL MATCHING)...")
    print(f"📂 Searching in: {SOURCE_DIR}")
    TRASH_DIR.mkdir(exist_ok=True)

    all_files = list(SOURCE_DIR.rglob("*.png")) + list(SOURCE_DIR.rglob("*.jpg"))
    print(f"🔍 Analyzing {len(all_files)} images...")

    # 1. Classify images.  Green images are grouped by dimensions right away
    #    so that step 2 only compares candidates of the same size.
    print("🎨 Classifying Green Screen vs Normal...")
    green_by_size = {}
    green_count = 0
    normal_images = []  # (path, size) tuples
    for f in all_files:
        if "_ZA_BRISANJE" in str(f):
            continue  # already quarantined
        try:
            # Close the file handle as soon as the pixels are decoded.
            with Image.open(f) as src:
                img = src.convert('RGB')
            is_green = is_green_screen_image(np.array(img))
        except Exception as e:
            # Not a bare ``except``: Ctrl-C / SystemExit must still be able
            # to stop a long scan.  Unreadable files are reported, not hidden.
            print(f"Skipping unreadable image {f}: {e}")
            continue
        if is_green:
            green_by_size.setdefault(img.size, []).append(f)
            green_count += 1
        else:
            normal_images.append((f, img.size))

    print(f"✅ Found {green_count} GREEN SCREEN images")
    print(f"✅ Found {len(normal_images)} NORMAL images")
    print("🔄 Comparing subjects to find duplicates...")

    # 2. Compare each normal image with the same-sized green candidates.
    duplicates_found = 0
    for i, (norm_path, size) in enumerate(normal_images):
        if i % 100 == 0:
            print(f" Processed {i}/{len(normal_images)} normal images...")

        for green_path in green_by_size.get(size, ()):
            if not are_visually_identical_subject(norm_path, green_path):
                continue

            # The counter restarts at 0 on every run, so a name may already
            # be taken by an earlier run; moving onto an existing file could
            # silently replace it.  Pick a free name instead.
            dest = TRASH_DIR / f"{duplicates_found}_{norm_path.name}"
            bump = 0
            while dest.exists():
                bump += 1
                dest = TRASH_DIR / f"{duplicates_found}-{bump}_{norm_path.name}"

            try:
                shutil.move(str(norm_path), str(dest))
            except OSError as e:
                print(f"Error moving: {e}")
            else:
                duplicates_found += 1
            # One match decides the fate of this normal image; repeating a
            # failed move for further candidates would only repeat the error.
            break

    print("-" * 50)
    print(f"🎉 DONE! Found and moved {duplicates_found} duplicates.")
    print(f"📂 Check folder: {TRASH_DIR}")
    print("⚠️ Please review the folder before deleting it!")
# Script entry point: run the cleanup only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    smart_cleanup()