pkm/40-playbooks/veille-rss.py

#!/usr/bin/env python3
"""
veille-rss.py — Pipeline de veille RSS vers Obsidian PKM

Usage:
    python veille-rss.py                    # mode keyword scoring (défaut)
    python veille-rss.py --ai               # scoring via Claude API
    python veille-rss.py --dry-run          # affiche sans créer de notes
    python veille-rss.py --feed URL         # teste un flux unique

Config : veille-feeds.yaml (dans le même dossier)
State  : veille-state.json (créé automatiquement)

Dépendances :
    pip install feedparser pyyaml anthropic
"""

import argparse
import html
import json
import os
import re
import sys
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from textwrap import shorten

import feedparser
import yaml

# ---------------------------------------------------------------------------
# Chemins
# ---------------------------------------------------------------------------
# Directory containing this script; the config and state files sit next to it.
SCRIPT_DIR = Path(__file__).parents[0]
CONFIG_FILE = SCRIPT_DIR.joinpath("veille-feeds.yaml")
STATE_FILE = SCRIPT_DIR.joinpath("veille-state.json")

# Obsidian inbox, resolved relative to the script:
# 40-playbooks -> vault root -> 00-inbox
PKM_INBOX = SCRIPT_DIR.parent.joinpath("00-inbox")

# ---------------------------------------------------------------------------
# Chargement config
# ---------------------------------------------------------------------------
def load_config() -> dict:
    """Load the YAML configuration stored in CONFIG_FILE.

    Returns:
        The parsed configuration mapping. An empty YAML document yields an
        empty dict so callers can always use ``.get``.

    Exits the process with status 1 when the file is missing or when its
    top-level YAML value is not a mapping.
    """
    if not CONFIG_FILE.exists():
        print(f"[ERREUR] Config introuvable : {CONFIG_FILE}")
        sys.exit(1)
    with open(CONFIG_FILE, encoding="utf-8") as f:
        config = yaml.safe_load(f)

    # safe_load returns None for an empty (or comment-only) document.
    if config is None:
        return {}
    if not isinstance(config, dict):
        print(f"[ERREUR] Config invalide (mapping YAML attendu) : {CONFIG_FILE}")
        sys.exit(1)
    return config


# ---------------------------------------------------------------------------
# State (articles déjà vus)
# ---------------------------------------------------------------------------
def load_state() -> set:
    """Load the IDs of already-processed articles from STATE_FILE.

    Returns:
        The set of IDs stored under the ``"seen"`` key. An empty set is
        returned when the file does not exist, is not valid JSON (for
        example after an interrupted write), or does not have the expected
        ``{"seen": [...]}`` shape, so a damaged state file never blocks a run.
    """
    if not STATE_FILE.exists():
        return set()

    try:
        with open(STATE_FILE, encoding="utf-8") as f:
            data = json.load(f)
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError.
        print(f"[AVERTISSEMENT] State illisible, réinitialisé : {STATE_FILE} ({e})")
        return set()

    if not isinstance(data, dict):
        return set()
    seen = data.get("seen", [])
    if not isinstance(seen, list):
        return set()
    return set(seen)


def save_state(seen: set, max_ids: int = 5000):
    """Persist the seen-article IDs to STATE_FILE, keeping only the newest.

    A set has no order, so slicing it cannot select the "latest" IDs. The
    order is rebuilt from the previously saved list: IDs already on disk
    keep their saved position, IDs that are new in this run are appended
    after them, and the list is then trimmed from the front.

    Args:
        seen: All IDs known at the end of the run.
        max_ids: Maximum number of IDs to keep, so the file cannot grow
            without bound.
    """
    try:
        with open(STATE_FILE, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        # Missing or unreadable previous state: every ID counts as new.
        data = None

    previous = []
    if isinstance(data, dict) and isinstance(data.get("seen"), list):
        previous = [entry_id for entry_id in data["seen"] if isinstance(entry_id, str)]

    # dict.fromkeys de-duplicates while preserving the saved order.
    kept = [entry_id for entry_id in dict.fromkeys(previous) if entry_id in seen]
    known = set(kept)
    # Sorted only to make the output deterministic; these IDs are all
    # equally recent.
    fresh = sorted((entry_id for entry_id in seen if entry_id not in known), key=str)

    ordered = kept + fresh
    # ordered[-0:] would return the whole list, so handle max_ids <= 0 apart.
    trimmed = ordered[-max_ids:] if max_ids > 0 else []
    with open(STATE_FILE, "w", encoding="utf-8") as f:
        json.dump({"seen": trimmed, "updated": datetime.now().isoformat()}, f, indent=2)


# ---------------------------------------------------------------------------
# Scoring par mots-clés
# ---------------------------------------------------------------------------
def keyword_score(text: str, keywords: list[dict]) -> tuple[int, list[str]]:
    """
    Score a text by case-insensitive substring matching against keywords.

    Each keyword is a dict with a required "term" and an optional "weight"
    (default 1). Returns (score, matched_terms): the sum of the weights of
    every keyword whose term occurs in the text, and the matching terms in
    their original casing and order.
    """
    haystack = text.lower()
    hits = [entry for entry in keywords if entry["term"].lower() in haystack]
    total = sum(entry.get("weight", 1) for entry in hits)
    return total, [entry["term"] for entry in hits]


# ---------------------------------------------------------------------------
# Scoring via Claude API
# ---------------------------------------------------------------------------
def ai_score(title: str, summary: str, roles: list[str], api_key: str) -> tuple[int, str]:
    """
    Score an article's relevance through the Claude API.

    Args:
        title: Article title.
        summary: Article summary; only the first 500 characters are sent.
        roles: Reader roles inserted into the prompt.
        api_key: Anthropic API key.

    Returns:
        (score, reason): score is an int clamped to 0-10 and reason is a
        single-line justification. On any failure (anthropic package not
        installed, API error, reply that cannot be parsed) returns (-1, "")
        so the caller can fall back to keyword scoring.
    """
    try:
        import anthropic
        client = anthropic.Anthropic(api_key=api_key)
        roles_str = ", ".join(roles)
        prompt = f"""Tu es un assistant de veille pour un CTO/DSI français avec ces rôles : {roles_str}.

Article :
Titre : {title}
Résumé : {summary[:500]}

Donne un score de pertinence de 0 à 10 pour cet article, et une justification en une phrase.
Réponds UNIQUEMENT au format JSON : {{"score": <int>, "reason": "<string>"}}"""

        msg = client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=100,
            messages=[{"role": "user", "content": prompt}]
        )
        raw = msg.content[0].text

        # The reply may wrap the JSON in a Markdown code fence or add text
        # around it despite the instruction, so parse only the outermost
        # {...} span instead of the whole reply.
        start, end = raw.find("{"), raw.rfind("}")
        if start == -1 or end < start:
            raise ValueError(f"no JSON object in reply: {raw!r}")
        result = json.loads(raw[start:end + 1])

        # Accept "7" or 7.0 as well as 7, and enforce the documented range;
        # this also guarantees the -1 failure sentinel can never be a real score.
        score = max(0, min(10, int(round(float(result["score"])))))
        # Collapse whitespace so the reason stays on one line in the note.
        reason = " ".join(str(result.get("reason", "")).split())
        return score, reason
    except Exception as e:
        print(f"  [AI score error] {e} — fallback keyword scoring")
        return -1, ""


# ---------------------------------------------------------------------------
# Création note Obsidian
# ---------------------------------------------------------------------------
def slugify(text: str) -> str:
    """Turn arbitrary text into a lowercase ASCII slug of at most 60 characters.

    Accented letters are reduced to their base letter through Unicode
    decomposition, so any Latin diacritic is handled rather than a fixed
    list. Characters other than ``a-z``, ``0-9``, whitespace and hyphens are
    dropped, runs of whitespace and hyphens collapse to a single hyphen, and
    the result never starts or ends with a hyphen. May return an empty
    string when nothing usable remains.
    """
    # Ligatures have no Unicode decomposition; expand them explicitly.
    text = text.lower().replace("œ", "oe").replace("æ", "ae")
    # NFKD splits "é" into "e" + a combining accent; dropping the combining
    # marks leaves the bare letter.
    decomposed = unicodedata.normalize("NFKD", text)
    text = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    text = re.sub(r"[^a-z0-9\s-]", "", text)
    text = re.sub(r"[\s-]+", "-", text)
    # Strip after truncating: the cut may land right after a hyphen.
    return text[:60].strip("-")


def create_note(item: dict, feed_meta: dict, score: int, matched: list[str], reason: str, dry_run: bool):
    """Write one Obsidian note for a feed entry into PKM_INBOX.

    Args:
        item: Feed entry; the "title", "link" and "summary" keys are read.
        feed_meta: Feed configuration; "name", "category" and "role" are
            copied into the front matter.
        score: Relevance score recorded in the note.
        matched: Keywords that matched (may be empty).
        reason: AI justification (may be empty).
        dry_run: When true, only print what would be created.

    Returns:
        True when the note was written (or would be, in dry-run mode),
        False when a note with the same filename already exists and
        nothing was written.
    """
    # Collapse whitespace so a multi-line title cannot break the heading or
    # the front matter; fall back when the feed gives no usable title.
    title = " ".join((item.get("title") or "").split()) or "Sans titre"
    link = item.get("link") or ""
    summary = item.get("summary") or ""

    # Strip tags first, then decode entities, so an escaped "&lt;b&gt;" stays
    # visible as text instead of being removed as a tag.
    clean_summary = html.unescape(re.sub(r"<[^>]+>", "", summary)).strip()
    clean_summary = shorten(clean_summary, width=500, placeholder="…")

    date_str = datetime.now().strftime("%Y-%m-%d")
    # A title with no ASCII letters or digits yields an empty slug.
    slug = slugify(title) or "sans-titre"
    filename = f"{date_str}-veille-{slug}.md"

    score_info = f"score: {score}"
    if matched:
        score_info += f" | mots-clés: {', '.join(matched)}"
    if reason:
        score_info += f" | {reason}"

    # A JSON string literal is a valid YAML double-quoted scalar, so quotes
    # and backslashes in the title are escaped correctly.
    yaml_title = json.dumps(title, ensure_ascii=False)

    content = f"""---
title: {yaml_title}
date: {date_str}
type: veille
source: {feed_meta.get('name', '')}
categorie: {feed_meta.get('category', '')}
role: {feed_meta.get('role', '')}
url: {link}
score: {score}
statut: à traiter
tags: [veille, {feed_meta.get('category', 'divers')}]
---

# {title}

> {score_info}

{clean_summary}

→ [Lire l'article]({link})
"""

    if dry_run:
        print(f"  [DRY-RUN] Note : {filename}")
        print(f"    Score={score} | {', '.join(matched) if matched else reason}")
        return True

    PKM_INBOX.mkdir(parents=True, exist_ok=True)
    note_path = PKM_INBOX / filename
    # Never overwrite an existing note, but say so instead of skipping silently.
    if note_path.exists():
        print(f"  · Note déjà présente, ignorée : 00-inbox/{filename}")
        return False
    with open(note_path, "w", encoding="utf-8") as f:
        f.write(content)
    print(f"  ✓ Note créée : 00-inbox/{filename}")
    return True


# ---------------------------------------------------------------------------
# Traitement d'un flux
# ---------------------------------------------------------------------------
def process_feed(feed_cfg: dict, keywords: list[dict], seen: set,
                 threshold: int, use_ai: bool, api_key: str,
                 dry_run: bool) -> int:
    """Fetch one feed, score its unseen entries and create notes for the relevant ones.

    Args:
        feed_cfg: Feed configuration; "url" is required, "name", "role" and
            "keywords" (feed-specific keywords) are optional.
        keywords: Global keywords applied to every feed.
        seen: IDs of entries already processed; updated in place.
        threshold: Minimum score for an entry to become a note.
        use_ai: Score with the Claude API when an API key is available.
        api_key: Anthropic API key (may be empty).
        dry_run: Passed through to create_note.

    Returns:
        Number of entries for which a note was created.
    """
    url  = feed_cfg["url"]
    name = feed_cfg.get("name", url)
    print(f"\n📡 {name}")

    parsed = feedparser.parse(url)
    if parsed.bozo and not parsed.entries:
        print(f"  [ERREUR] Impossible de lire le flux : {parsed.bozo_exception}")
        return 0

    # Global keywords plus this feed's own. A YAML key present but left
    # empty loads as None, hence the `or []`.
    all_kws = list(keywords or []) + list(feed_cfg.get("keywords") or [])

    roles = [feed_cfg.get("role", "général")]
    new_notes = 0

    for entry in parsed.entries:
        entry_id = entry.get("id") or entry.get("link") or entry.get("title", "")
        if not entry_id:
            # Without any identifier the entry cannot be de-duplicated, and
            # storing "" in `seen` would hide every later id-less entry.
            print("  · Ignoré (entrée sans id, lien ni titre)")
            continue
        if entry_id in seen:
            continue

        seen.add(entry_id)

        title   = entry.get("title", "") or ""
        summary = entry.get("summary", "") or ""
        text    = f"{title} {summary}"

        # -1 means "no AI score": either AI scoring is off or ai_score failed.
        score, matched, reason = -1, [], ""
        if use_ai and api_key:
            score, reason = ai_score(title, summary, roles, api_key)
        if score == -1:
            score, matched = keyword_score(text, all_kws)
            reason = ""

        if score >= threshold:
            # Count the note unless create_note explicitly reports, by
            # returning False, that nothing was written.
            if create_note(entry, feed_cfg, score, matched, reason, dry_run) is not False:
                new_notes += 1
        else:
            print(f"  · Ignoré (score {score}) : {shorten(title, 70)}")

    if new_notes == 0:
        print("  Aucun nouvel article retenu.")
    return new_notes


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Command-line entry point: parse arguments, process the feeds, save state.

    Either the single feed given with --feed or every feed listed in the
    configuration is processed. The seen-article state is saved at the end
    of the run, except in --dry-run mode.
    """
    parser = argparse.ArgumentParser(description="Pipeline veille RSS → Obsidian")
    parser.add_argument("--ai",      action="store_true", help="Scoring via Claude API")
    parser.add_argument("--dry-run", action="store_true", help="Affiche sans créer de notes")
    parser.add_argument("--feed",    metavar="URL",        help="Teste un flux unique")
    args = parser.parse_args()

    # A YAML key that is present but left empty loads as None, so `or`
    # fallbacks are used instead of .get() defaults.
    config   = load_config() or {}
    seen     = load_state()
    keywords = config.get("keywords") or []
    threshold = config.get("threshold")
    if threshold is None:  # 0 is a legitimate threshold, so test for None
        threshold = 2
    api_key  = config.get("anthropic_api_key") or os.environ.get("ANTHROPIC_API_KEY", "")

    if args.ai and not api_key:
        print("[AVERTISSEMENT] --ai activé mais ANTHROPIC_API_KEY non définie. Fallback keyword.")

    if args.feed:
        # Single feed given on the command line
        feeds = [{"url": args.feed, "name": args.feed, "category": "test", "role": "général"}]
    else:
        feeds = config.get("feeds") or []

    total = 0
    for feed_cfg in feeds:
        total += process_feed(feed_cfg, keywords, seen, threshold, args.ai, api_key, args.dry_run)

    if not args.dry_run:
        save_state(seen)

    print(f"\n✅ Terminé — {total} note(s) créée(s) dans 00-inbox/")


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()