first commit

2026-06-28 20:21:40 +02:00
commit b66b065da1
16 changed files with 1097 additions and 0 deletions
@@ -0,0 +1,5 @@
+from .ocr import TweetOCRExtractor
+from .classifier import ZeroShotClassifier
+from .organizer import ImageThemeOrganizer
+
+__version__ = "1.0.0"
@@ -0,0 +1,51 @@
+import logging
+from typing import List, Dict, Any, Optional
+from transformers import pipeline
+
+logger = logging.getLogger(__name__)
+
+class ZeroShotClassifier:
+    """
+    Classe responsable de la classification de texte à l'aide de modèles Hugging Face Zero-Shot.
+    """
+    def __init__(self, model_name: str = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"):
+        """
+        Initialise le classifieur zero-shot.
+        :param model_name: Nom du modèle Hugging Face à utiliser.
+        """
+        self.model_name = model_name
+        self._pipeline = None
+
+    @property
+    def classifier_pipeline(self):
+        """
+        Initialisation tardive (lazy loading) du pipeline pour économiser de la mémoire et du temps au démarrage.
+        """
+        if self._pipeline is None:
+            logger.info(f"Chargement du pipeline de classification avec le modèle {self.model_name} (ceci peut prendre quelques secondes)...")
+            # On laisse Hugging Face gérer le choix du device (GPU s'il est dispo, sinon CPU)
+            self._pipeline = pipeline("zero-shot-classification", model=self.model_name)
+        return self._pipeline
+
+    def classify(self, text: str, candidate_labels: List[str] = None) -> Dict[str, Any]:
+        """
+        Classifie un texte selon une liste de catégories candidates.
+        Si aucune catégorie n'est fournie, utilise les catégories de harcèlement par défaut.
+        :param text: Le texte à classifier.
+        :param candidate_labels: Liste des catégories (labels).
+        :return: Dictionnaire contenant les labels et leurs scores associés.
+        """
+        if candidate_labels is None:
+            candidate_labels = ["Cyberharcèlement", "Insulte", "Menace", "Non-harcèlement"]
+
+        if not text or not text.strip():
+            # Si le texte est vide, on renvoie une structure vide ou par défaut
+            return {"labels": [], "scores": []}
+        
+        try:
+            # On exécute le pipeline de classification
+            result = self.classifier_pipeline(text, candidate_labels=candidate_labels)
+            return result
+        except Exception as e:
+            logger.error(f"Erreur lors de la classification du texte : {e}")
+            raise RuntimeError(f"Échec de la classification : {e}") from e
@@ -0,0 +1,134 @@
+import argparse
+import sys
+import logging
+from pathlib import Path
+from .organizer import ImageThemeOrganizer
+from .web_generator import WebReportGenerator
+
+def setup_logging(verbose: bool):
+    """
+    Configure la journalisation.
+    """
+    level = logging.DEBUG if verbose else logging.INFO
+    logging.basicConfig(
+        level=level,
+        format="%(asctime)s [%(levelname)s] %(name)s : %(message)s",
+        handlers=[
+            logging.StreamHandler(sys.stdout)
+        ]
+    )
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Classement automatique de captures d'écran de tweets par thèmes à l'aide d'EasyOCR et d'Hugging Face."
+    )
+    
+    parser.add_argument(
+        "-i", "--input-dir",
+        type=str,
+        default=None,
+        help="Chemin vers le dossier contenant les captures d'écran à classer."
+    )
+    
+    parser.add_argument(
+        "-o", "--output-dir",
+        type=str,
+        default=None,
+        help="Dossier de destination (par défaut, utilise ./ok/)."
+    )
+    
+    parser.add_argument(
+        "--db",
+        type=str,
+        default=str(Path(__file__).parent.parent / "captures/ok/tweets.csv"),
+        help="Chemin vers le fichier base de données CSV (défaut: 'captures/ok/tweets.csv')."
+    )
+    
+    parser.add_argument(
+        "--generate-report",
+        action="store_true",
+        help="Génère un rapport HTML à partir de la base de données CSV."
+    )
+    
+    parser.add_argument(
+        "-m", "--model",
+        type=str,
+        default="MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7",
+        help="Modèle Hugging Face Zero-Shot à utiliser."
+    )
+    
+    parser.add_argument(
+        "-l", "--languages",
+        type=str,
+        default="fr,en",
+        help="Langues pour EasyOCR, séparées par des virgules (défaut: 'fr,en')."
+    )
+    
+    parser.add_argument(
+        "-t", "--threshold",
+        type=float,
+        default=0.35,
+        help="Seuil de confiance de classification (0.0 à 1.0, défaut: 0.35)."
+    )
+    
+    parser.add_argument(
+        "--copy",
+        action="store_true",
+        help="Copie les images au lieu de les déplacer."
+    )
+    
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Exécute une simulation sans modifier ni déplacer les fichiers."
+    )
+    
+    parser.add_argument(
+        "-v", "--verbose",
+        action="store_true",
+        help="Active le mode verbeux pour afficher les logs de débogage."
+    )
+
+    args = parser.parse_args()
+
+    setup_logging(args.verbose)
+    
+    if args.generate_report:
+        generator = WebReportGenerator(Path(args.db))
+        generator.generate()
+        return
+
+    # Normal processing path
+    if not args.input_dir:
+        print("Erreur : --input-dir est requis pour le traitement.", file=sys.stderr)
+        sys.exit(1)
+
+    input_path = Path(args.input_dir)
+    if not input_path.exists():
+        print(f"Erreur : Le dossier d'entrée '{args.input_dir}' n'existe pas.", file=sys.stderr)
+        sys.exit(1)
+        
+    languages_list = [lang.strip() for lang in args.languages.split(",") if lang.strip()]
+
+    output_path = Path(args.output_dir) if args.output_dir else None
+
+    # Instanciation de l'organisateur
+    organizer = ImageThemeOrganizer(
+        input_dir=input_path,
+        output_dir=output_path,
+        ocr_languages=languages_list,
+        model_name=args.model,
+        confidence_threshold=args.threshold,
+        copy_only=args.copy,
+        dry_run=args.dry_run,
+        db_path=Path(args.db)
+    )
+
+    try:
+        organizer.run()
+    except Exception as e:
+        print(f"Une erreur critique est survenue : {e}", file=sys.stderr)
+        sys.exit(1)
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,62 @@
+import csv
+from pathlib import Path
+from typing import List, Optional, Dict, Any
+
+class CSVDatabaseManager:
+    def __init__(self, db_path: Path):
+        self.db_path = db_path
+        self.fieldnames = ['filename', 'filepath', 'status', 'ocr_text', 'detected_category', 'confidence', 'created_at']
+        self._initialize_csv()
+
+    def _initialize_csv(self):
+        if not self.db_path.exists():
+            self.db_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(self.db_path, 'w', newline='', encoding='utf-8') as f:
+                writer = csv.DictWriter(f, fieldnames=self.fieldnames)
+                writer.writeheader()
+
+    def _read_all(self) -> List[Dict[str, Any]]:
+        with open(self.db_path, 'r', newline='', encoding='utf-8') as f:
+            return list(csv.DictReader(f))
+
+    def _write_all(self, data: List[Dict[str, Any]]):
+        with open(self.db_path, 'w', newline='', encoding='utf-8') as f:
+            writer = csv.DictWriter(f, fieldnames=self.fieldnames)
+            writer.writeheader()
+            writer.writerows(data)
+
+    def add_files(self, file_paths: List[Path]):
+        data = self._read_all()
+        existing_filenames = {row['filename'] for row in data}
+        
+        new_entries = []
+        for path in file_paths:
+            if path.name not in existing_filenames:
+                new_entries.append({
+                    'filename': path.name,
+                    'filepath': str(path.absolute()),
+                    'status': 'pending',
+                    'ocr_text': '',
+                    'detected_category': '',
+                    'confidence': '',
+                    'created_at': '' # Could add timestamp here
+                })
+        
+        if new_entries:
+            data.extend(new_entries)
+            self._write_all(data)
+
+    def get_pending_files(self) -> List[Dict[str, Any]]:
+        return [row for row in self._read_all() if row['status'] == 'pending']
+
+    def update_file_status(self, filename: str, status: str, ocr_text: Optional[str] = None, category: Optional[str] = None, confidence: Optional[float] = None, new_filepath: Optional[str] = None):
+        data = self._read_all()
+        for row in data:
+            if row['filename'] == filename:
+                row['status'] = status
+                if ocr_text is not None: row['ocr_text'] = ocr_text
+                if category is not None: row['detected_category'] = category
+                if confidence is not None: row['confidence'] = str(confidence)
+                if new_filepath is not None: row['filepath'] = new_filepath
+                break
+        self._write_all(data)
@@ -0,0 +1,50 @@
+import logging
+from pathlib import Path
+from typing import List, Optional
+import easyocr
+
+logger = logging.getLogger(__name__)
+
+class TweetOCRExtractor:
+    """
+    Classe responsable de l'extraction de texte à partir de captures d'écran en utilisant EasyOCR.
+    """
+    def __init__(self, languages: Optional[List[str]] = None):
+        """
+        Initialise le lecteur EasyOCR.
+        :param languages: Liste des langues à charger (par défaut ['fr', 'en']).
+        """
+        if languages is None:
+            languages = ['fr', 'en']
+        self.languages = languages
+        self._reader = None
+
+    @property
+    def reader(self) -> easyocr.Reader:
+        """
+        Initialisation tardive (lazy loading) d'EasyOCR pour économiser de la mémoire si non utilisé.
+        """
+        if self._reader is None:
+            logger.info("Initialisation de l'OCR EasyOCR (ceci peut prendre quelques secondes)...")
+            self._reader = easyocr.Reader(self.languages)
+        return self._reader
+
+    def extract_text(self, image_path: Path) -> str:
+        """
+        Extrait le texte d'une image.
+        :param image_path: Chemin vers le fichier image.
+        :return: Texte brut extrait.
+        """
+        if not image_path.exists():
+            raise FileNotFoundError(f"Le fichier image n'existe pas : {image_path}")
+        
+        try:
+            # easyocr accepte un chemin de fichier sous forme de string
+            results = self.reader.readtext(str(image_path))
+            # On joint les blocs de texte détectés
+            text_blocks = [text for (_, text, _) in results]
+            extracted_text = " ".join(text_blocks).strip()
+            return extracted_text
+        except Exception as e:
+            logger.error(f"Erreur lors de l'extraction OCR sur {image_path.name} : {e}")
+            raise RuntimeError(f"Échec de l'OCR : {e}") from e
@@ -0,0 +1,152 @@
+import logging
+import shutil
+from pathlib import Path
+from typing import List, Optional
+from tqdm import tqdm
+
+from .ocr import TweetOCRExtractor
+from .classifier import ZeroShotClassifier
+from .database_manager import CSVDatabaseManager
+
+logger = logging.getLogger(__name__)
+
+# Extensions d'images supportées
+SUPPORTED_EXTENSIONS = {'.png', '.jpg', '.jpeg', '.webp', '.bmp', '.tiff'}
+
+class ImageThemeOrganizer:
+    """
+    Orchestrateur principal du traitement : parcourt les images, extrait le texte,
+    classifie, et organise les fichiers dans des sous-dossiers via une base de données.
+    """
+    def __init__(
+        self,
+        input_dir: Path,
+        output_dir: Optional[Path] = None,
+        ocr_languages: Optional[List[str]] = None,
+        model_name: str = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7",
+        confidence_threshold: float = 0.35,
+        copy_only: bool = False,
+        dry_run: bool = False,
+        db_path: Path = Path("captures/ok/tweets.csv")
+    ):
+        self.input_dir = Path(input_dir)
+        self.output_dir = Path(output_dir) if output_dir else self.input_dir / "ok"
+        self.confidence_threshold = confidence_threshold
+        self.copy_only = copy_only
+        self.dry_run = dry_run
+        
+        self.db = CSVDatabaseManager(db_path)
+
+        # Initialisation des modules (lazy)
+        self.ocr_extractor = TweetOCRExtractor(languages=ocr_languages)
+        self.classifier = ZeroShotClassifier(model_name=model_name)
+
+    def scan_new_files(self):
+        """
+        Trouve tous les fichiers images dans le dossier d'entrée et les ajoute à la base de données.
+        """
+        images = []
+        for file in self.input_dir.iterdir():
+            if file.is_file() and file.suffix.lower() in SUPPORTED_EXTENSIONS:
+                images.append(file)
+        
+        if images:
+            self.db.add_files(images)
+            logger.info(f"Ajout de {len(images)} fichiers à la base de données.")
+
+    def process_pending(self):
+        """
+        Traite tous les fichiers en attente dans la base de données.
+        """
+        pending_files = self.db.get_pending_files()
+        if not pending_files:
+            logger.info("Aucune image en attente.")
+            return
+
+        for record in tqdm(pending_files, desc="Traitement des tweets"):
+            self._process_record(record)
+
+    def _process_record(self, record: dict):
+        image_path = Path(record['filepath'])
+        logger.info(f"Traitement de l'image : {image_path.name}")
+        
+        try:
+            # 1. Extraction du texte
+            text = self.ocr_extractor.extract_text(image_path)
+            
+            dest_category = "Non-classifié"
+            
+            if not text:
+                logger.info(f" Aucun texte extrait pour {image_path.name}.")
+                dest_category = "Sans_Texte"
+                confidence = 1.0
+            else:
+                # 2. Classification du texte (Harassment categories)
+                classification_result = self.classifier.classify(text)
+                
+                # Récupérer la meilleure catégorie
+                if classification_result and classification_result.get("labels"):
+                    best_label = classification_result["labels"][0]
+                    best_score = classification_result["scores"][0]
+                    
+                    logger.info(f" Classification : {best_label} (score: {best_score:.2f})")
+                    
+                    if best_score >= self.confidence_threshold:
+                        dest_category = best_label
+                        confidence = best_score
+                    else:
+                        dest_category = "Inclassable"
+                        confidence = best_score
+                else:
+                    dest_category = "Inclassable"
+                    confidence = 0.0
+
+            # 3. Organisation physique du fichier
+            dest_path = self._organize_file(image_path, dest_category)
+
+            # 4. Mise à jour DB (on enregistre le nouvel emplacement du fichier
+            #    afin que le rapport HTML pointe vers l'image déplacée).
+            self.db.update_file_status(
+                record['filename'], 'processed', text, dest_category, confidence,
+                new_filepath=str(dest_path)
+            )
+
+        except Exception as e:
+            logger.error(f" Échec du traitement de {image_path.name} : {e}")
+            self.db.update_file_status(record['filename'], 'error')
+
+    def _organize_file(self, image_path: Path, category: str) -> Path:
+        """
+        Crée le dossier de destination et y déplace ou copie le fichier image.
+
+        Retourne le chemin de destination du fichier (utilisé pour la base de
+        données et le rapport HTML).
+        """
+        dest_dir = self.output_dir / category
+        dest_path = dest_dir / image_path.name
+
+        if self.dry_run:
+            logger.info(f"[DRY-RUN] Déplacer {image_path.name} -> {dest_dir.name}/")
+            return dest_path
+
+        # Créer le dossier de catégorie si nécessaire
+        dest_dir.mkdir(parents=True, exist_ok=True)
+
+        try:
+            if self.copy_only:
+                shutil.copy2(image_path, dest_path)
+            else:
+                shutil.move(image_path, dest_path)
+            logger.info(f" Organisé : {image_path.name} -> {dest_dir.name}/")
+        except Exception as e:
+            logger.error(f"Impossible d'organiser le fichier {image_path.name} vers {dest_dir} : {e}")
+
+        return dest_path
+
+    def run(self) -> dict:
+        """
+        Exécute le processus complet.
+        """
+        self.scan_new_files()
+        self.process_pending()
+        return {}
@@ -0,0 +1,353 @@
+import csv
+from pathlib import Path
+from datetime import datetime
+from urllib.parse import quote
+from collections import Counter
+from jinja2 import Template
+
+# Couleurs par catégorie (avec repli sur une couleur neutre).
+CATEGORY_COLORS = {
+    "Cyberharcèlement": "#8e44ad",
+    "Menace": "#c0392b",
+    "Insulte": "#d35400",
+    "Harcèlement": "#e74c3c",
+    "Non-harcèlement": "#27ae60",
+    "Sans_Texte": "#7f8c8d",
+    "Inclassable": "#95a5a6",
+    "Non-classifié": "#bdc3c7",
+}
+DEFAULT_COLOR = "#34495e"
+
+TEMPLATE = """
+<!DOCTYPE html>
+<html lang="fr">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Rapport de classification</title>
+    <style>
+        :root {
+            --bg: #f0f2f5;
+            --card-bg: #ffffff;
+            --text: #2c3e50;
+            --muted: #7f8c8d;
+        }
+        * { box-sizing: border-box; }
+        body {
+            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+            margin: 0; padding: 0 20px 60px;
+            background-color: var(--bg); color: var(--text);
+        }
+        header {
+            position: sticky; top: 0; z-index: 50;
+            background: var(--bg); padding: 20px 0 12px;
+            border-bottom: 1px solid #dfe3e8;
+        }
+        h1 {
+            text-align: center; color: #1a2a6c;
+            margin: 0 0 4px; font-size: 1.8em;
+        }
+        .subtitle { text-align: center; color: var(--muted); margin: 0 0 16px; font-size: .9em; }
+
+        /* Barre de statistiques */
+        .stats {
+            display: flex; flex-wrap: wrap; gap: 10px;
+            justify-content: center; margin-bottom: 14px;
+        }
+        .stat {
+            background: var(--card-bg); border-radius: 10px; padding: 8px 16px;
+            box-shadow: 0 2px 6px rgba(0,0,0,.08); text-align: center; min-width: 90px;
+        }
+        .stat .num { font-size: 1.4em; font-weight: 700; }
+        .stat .lbl { font-size: .72em; color: var(--muted); text-transform: uppercase; letter-spacing: .5px; }
+
+        /* Contrôles : filtres, recherche, tri */
+        .controls {
+            display: flex; flex-wrap: wrap; gap: 10px;
+            align-items: center; justify-content: center;
+        }
+        .filters { display: flex; flex-wrap: wrap; gap: 8px; justify-content: center; }
+        .filter-btn {
+            border: none; cursor: pointer; color: #fff;
+            padding: 6px 14px; border-radius: 20px; font-size: .82em; font-weight: 600;
+            opacity: .55; transition: opacity .15s, transform .15s;
+        }
+        .filter-btn:hover { transform: translateY(-1px); }
+        .filter-btn.active { opacity: 1; box-shadow: 0 2px 8px rgba(0,0,0,.2); }
+        #search, #sort {
+            padding: 8px 12px; border: 1px solid #cfd6dd; border-radius: 8px;
+            font-size: .9em; background: #fff;
+        }
+        #search { min-width: 220px; }
+
+        /* Galerie */
+        .gallery {
+            display: grid; gap: 24px; margin-top: 24px;
+            grid-template-columns: repeat(auto-fill, minmax(330px, 1fr));
+        }
+        .card {
+            background: var(--card-bg); border-radius: 15px; overflow: hidden;
+            box-shadow: 0 8px 18px rgba(0,0,0,.08);
+            transition: transform .2s, box-shadow .2s;
+            display: flex; flex-direction: column;
+            border-top: 5px solid var(--cat-color, #ccc);
+        }
+        .card:hover { transform: translateY(-5px); box-shadow: 0 14px 28px rgba(0,0,0,.14); }
+        .img-container {
+            height: 240px; overflow: hidden; background: #1c2733;
+            display: flex; align-items: center; justify-content: center; cursor: zoom-in;
+        }
+        .img-container img { max-width: 100%; max-height: 100%; object-fit: contain; }
+        .card-body { padding: 16px 18px; display: flex; flex-direction: column; gap: 10px; flex-grow: 1; }
+        .card-head { display: flex; justify-content: space-between; align-items: center; gap: 8px; }
+        .category {
+            font-weight: 700; color: #fff; padding: 4px 12px;
+            border-radius: 20px; font-size: .8em; white-space: nowrap;
+        }
+        .filename { font-size: .78em; color: var(--muted); word-break: break-word; }
+
+        /* Barre de confiance */
+        .confidence { font-size: .78em; }
+        .confidence .bar-bg { background: #eceff1; border-radius: 6px; height: 8px; overflow: hidden; margin-top: 3px; }
+        .confidence .bar { height: 100%; border-radius: 6px; }
+        .conf-high { background: #27ae60; }
+        .conf-mid  { background: #f39c12; }
+        .conf-low  { background: #e74c3c; }
+
+        .ocr-text {
+            font-size: .82em; color: #444; background: #f7f9fa;
+            padding: 10px 12px; border-radius: 8px;
+            border-left: 4px solid var(--cat-color, #ccc);
+            max-height: 130px; overflow-y: auto; white-space: pre-wrap; line-height: 1.45;
+        }
+        .empty {
+            grid-column: 1 / -1; text-align: center; color: var(--muted);
+            padding: 60px 20px; font-size: 1.1em;
+        }
+
+        /* Lightbox */
+        #lightbox {
+            display: none; position: fixed; inset: 0; z-index: 100;
+            background: rgba(0,0,0,.88); align-items: center; justify-content: center;
+            cursor: zoom-out; padding: 30px;
+        }
+        #lightbox img { max-width: 95%; max-height: 95%; border-radius: 8px; box-shadow: 0 0 40px rgba(0,0,0,.6); }
+    </style>
+</head>
+<body>
+    <header>
+        <h1>Rapport de classification des tweets</h1>
+        <p class="subtitle">Généré le {{ generated_at }} — {{ items|length }} élément(s)</p>
+
+        <div class="stats">
+            <div class="stat"><div class="num">{{ items|length }}</div><div class="lbl">Total</div></div>
+            {% for cat, count in category_counts %}
+            <div class="stat">
+                <div class="num" style="color: {{ category_colors.get(cat, default_color) }}">{{ count }}</div>
+                <div class="lbl">{{ cat }}</div>
+            </div>
+            {% endfor %}
+        </div>
+
+        <div class="controls">
+            <div class="filters">
+                <button class="filter-btn active" data-cat="all" style="background:#34495e" onclick="filterCat(this,'all')">Tout</button>
+                {% for cat, count in category_counts %}
+                <button class="filter-btn active" data-cat="{{ cat }}"
+                        style="background: {{ category_colors.get(cat, default_color) }}"
+                        onclick="filterCat(this,'{{ cat }}')">{{ cat }} ({{ count }})</button>
+                {% endfor %}
+            </div>
+            <input id="search" type="text" placeholder="🔎 Rechercher dans le texte / fichier…" oninput="applyFilters()">
+            <select id="sort" onchange="sortCards()">
+                <option value="conf-desc">Confiance ↓</option>
+                <option value="conf-asc">Confiance ↑</option>
+                <option value="cat">Catégorie (A→Z)</option>
+                <option value="name">Nom de fichier</option>
+            </select>
+        </div>
+    </header>
+
+    <div class="gallery" id="gallery">
+        {% for item in items %}
+        <div class="card" data-category="{{ item.detected_category }}"
+             data-confidence="{{ item.confidence_value }}"
+             data-filename="{{ item.filename|lower }}"
+             data-text="{{ item.ocr_text|lower }}"
+             style="--cat-color: {{ category_colors.get(item.detected_category, default_color) }}">
+            <div class="img-container" onclick="openLightbox('{{ item.relative_filepath }}')">
+                <img src="{{ item.relative_filepath }}" alt="{{ item.filename }}" loading="lazy"
+                     onerror="this.parentElement.innerHTML='<span style=&quot;color:#bbb;font-size:.85em&quot;>Image introuvable</span>'">
+            </div>
+            <div class="card-body">
+                <div class="card-head">
+                    <span class="category" style="background: {{ category_colors.get(item.detected_category, default_color) }}">
+                        {{ item.detected_category }}
+                    </span>
+                    <span class="filename">{{ item.filename }}</span>
+                </div>
+                <div class="confidence">
+                    Confiance : <strong>{{ item.confidence_pct }}%</strong>
+                    <div class="bar-bg">
+                        <div class="bar {{ item.conf_class }}" style="width: {{ item.confidence_pct }}%"></div>
+                    </div>
+                </div>
+                <div class="ocr-text">{{ item.ocr_text or "— Aucun texte extrait —" }}</div>
+            </div>
+        </div>
+        {% else %}
+        <div class="empty">Aucun élément traité à afficher.<br>Lancez d'abord le traitement des images.</div>
+        {% endfor %}
+    </div>
+
+    <div id="lightbox" onclick="this.style.display='none'"><img id="lightbox-img" src="" alt=""></div>
+
+    <script>
+        const activeCats = new Set(['all']);
+
+        function filterCat(btn, cat) {
+            const buttons = document.querySelectorAll('.filter-btn');
+            if (cat === 'all') {
+                activeCats.clear(); activeCats.add('all');
+                buttons.forEach(b => b.classList.toggle('active', b.dataset.cat === 'all'));
+            } else {
+                document.querySelector('.filter-btn[data-cat="all"]').classList.remove('active');
+                activeCats.delete('all');
+                btn.classList.toggle('active');
+                if (btn.classList.contains('active')) activeCats.add(cat); else activeCats.delete(cat);
+                if (activeCats.size === 0) {
+                    activeCats.add('all');
+                    document.querySelector('.filter-btn[data-cat="all"]').classList.add('active');
+                }
+            }
+            applyFilters();
+        }
+
+        function applyFilters() {
+            const q = document.getElementById('search').value.toLowerCase().trim();
+            document.querySelectorAll('.card').forEach(card => {
+                const catOk = activeCats.has('all') || activeCats.has(card.dataset.category);
+                const txtOk = !q || card.dataset.text.includes(q) || card.dataset.filename.includes(q);
+                card.style.display = (catOk && txtOk) ? '' : 'none';
+            });
+        }
+
+        function sortCards() {
+            const mode = document.getElementById('sort').value;
+            const gallery = document.getElementById('gallery');
+            const cards = Array.from(gallery.querySelectorAll('.card'));
+            cards.sort((a, b) => {
+                const ca = parseFloat(a.dataset.confidence) || 0, cb = parseFloat(b.dataset.confidence) || 0;
+                switch (mode) {
+                    case 'conf-asc':  return ca - cb;
+                    case 'conf-desc': return cb - ca;
+                    case 'cat':  return a.dataset.category.localeCompare(b.dataset.category);
+                    case 'name': return a.dataset.filename.localeCompare(b.dataset.filename);
+                }
+            });
+            cards.forEach(c => gallery.appendChild(c));
+        }
+
+        function openLightbox(src) {
+            document.getElementById('lightbox-img').src = src;
+            document.getElementById('lightbox').style.display = 'flex';
+        }
+
+        sortCards();
+    </script>
+</body>
+</html>
+"""
+
+
+class WebReportGenerator:
+    def __init__(self, csv_path: Path, output_dir: Path = Path("captures/ok")):
+        self.csv_path = Path(csv_path).resolve()
+        self.output_dir = Path(output_dir).resolve()
+
+        # Si le CSV n'existe pas, on le cherche dans output_dir.
+        if not self.csv_path.exists():
+            potential_path = self.output_dir / self.csv_path.name
+            if potential_path.exists():
+                self.csv_path = potential_path
+
+    def _resolve_relative_path(self, row: dict) -> str:
+        """
+        Détermine le chemin de l'image relatif au rapport HTML (placé dans
+        output_dir), encodé pour une URL.
+
+        Robustesse : si le chemin enregistré n'existe pas (CSV obsolète d'avant
+        le déplacement), on reconstruit le chemin attendu
+        ``output_dir/catégorie/fichier``.
+        """
+        image_path = Path(row['filepath'])
+        category = row.get('detected_category') or ''
+
+        candidates = [image_path]
+        if category:
+            candidates.append(self.output_dir / category / image_path.name)
+        candidates.append(self.output_dir / image_path.name)
+
+        chosen = next((c for c in candidates if c.exists()), image_path)
+
+        try:
+            relative = chosen.relative_to(self.output_dir)
+        except ValueError:
+            # Repli : catégorie/fichier, sinon juste le nom du fichier.
+            relative = Path(category) / image_path.name if category else Path(image_path.name)
+
+        # Encodage URL (espaces, apostrophes typographiques, accents…) en
+        # préservant les séparateurs de dossiers.
+        return quote(relative.as_posix())
+
+    @staticmethod
+    def _confidence_fields(raw_value: str) -> dict:
+        try:
+            value = float(raw_value)
+        except (TypeError, ValueError):
+            value = 0.0
+        pct = round(value * 100)
+        if pct >= 60:
+            conf_class = "conf-high"
+        elif pct >= 35:
+            conf_class = "conf-mid"
+        else:
+            conf_class = "conf-low"
+        return {"confidence_value": value, "confidence_pct": pct, "conf_class": conf_class}
+
+    def generate(self):
+        items = []
+        if self.csv_path.exists():
+            with open(self.csv_path, 'r', encoding='utf-8') as f:
+                reader = csv.DictReader(f)
+                for row in reader:
+                    if row.get('status') == 'processed':
+                        row['relative_filepath'] = self._resolve_relative_path(row)
+                        row.update(self._confidence_fields(row.get('confidence')))
+                        items.append(row)
+
+        # Tri par défaut : confiance décroissante.
+        items.sort(key=lambda r: r['confidence_value'], reverse=True)
+
+        category_counts = Counter(item['detected_category'] for item in items)
+        # Catégories triées par effectif décroissant.
+        sorted_counts = sorted(category_counts.items(), key=lambda kv: (-kv[1], kv[0]))
+
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        output_path = self.output_dir / f"report_{timestamp}.html"
+
+        template = Template(TEMPLATE)
+        html = template.render(
+            items=items,
+            category_counts=sorted_counts,
+            category_colors=CATEGORY_COLORS,
+            default_color=DEFAULT_COLOR,
+            generated_at=datetime.now().strftime("%d/%m/%Y à %H:%M"),
+        )
+
+        with open(output_path, 'w', encoding='utf-8') as f:
+            f.write(html)
+        print(f"Rapport généré : {output_path}")
+        return output_path