#!/usr/bin/env python3
"""
veille-rss.py — RSS monitoring pipeline feeding an Obsidian PKM vault.

Usage:
    python veille-rss.py              # keyword scoring mode (default)
    python veille-rss.py --ai         # scoring through the Claude API
    python veille-rss.py --dry-run    # print results without creating notes
    python veille-rss.py --feed URL   # test a single feed

Config : veille-feeds.yaml (same folder as this script)
State  : veille-state.json (created automatically)

Dependencies:
    pip install feedparser pyyaml anthropic
"""

import argparse
import hashlib
import html
import json
import os
import re
import sys
import unicodedata
from datetime import datetime, timezone
from pathlib import Path
from textwrap import shorten

import feedparser
import yaml

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------

SCRIPT_DIR = Path(__file__).parent
CONFIG_FILE = SCRIPT_DIR / "veille-feeds.yaml"
STATE_FILE = SCRIPT_DIR / "veille-state.json"

# Path to the Obsidian inbox, relative to the script
# (40-playbooks -> vault root -> 00-inbox)
PKM_INBOX = SCRIPT_DIR.parent / "00-inbox"

# Upper bound on the number of remembered entry IDs, so the state file
# cannot grow without limit.
MAX_SEEN_IDS = 5000

# Grabs the outermost {...} span of a model reply (tolerates code fences or
# extra prose around the JSON object).
_JSON_OBJECT_RE = re.compile(r"\{.*\}", re.DOTALL)


# ---------------------------------------------------------------------------
# Config loading
# ---------------------------------------------------------------------------

def load_config() -> dict:
    """Load the YAML configuration file.

    Returns:
        The parsed configuration mapping; an empty dict when the file is empty.

    Exits the process with status 1 when the file is missing or when its top
    level is not a mapping.
    """
    if not CONFIG_FILE.exists():
        print(f"[ERREUR] Config introuvable : {CONFIG_FILE}")
        sys.exit(1)
    with open(CONFIG_FILE, encoding="utf-8") as f:
        config = yaml.safe_load(f)
    if config is None:
        # yaml.safe_load returns None for an empty document.
        return {}
    if not isinstance(config, dict):
        print(f"[ERREUR] Config invalide (mapping YAML attendu) : {CONFIG_FILE}")
        sys.exit(1)
    return config


# ---------------------------------------------------------------------------
# State (entries already seen)
# ---------------------------------------------------------------------------

def _read_seen_ids(warn: bool = True) -> list:
    """Return the ordered list of IDs stored in the state file (oldest first)."""
    if not STATE_FILE.exists():
        return []
    try:
        with open(STATE_FILE, encoding="utf-8") as f:
            data = json.load(f)
    except ValueError as exc:
        # Covers json.JSONDecodeError and UnicodeDecodeError: a damaged state
        # file should not prevent the pipeline from running.
        if warn:
            print(f"[AVERTISSEMENT] State illisible ({exc}) — reprise à vide.")
        return []
    ids = data.get("seen", []) if isinstance(data, dict) else []
    if not isinstance(ids, list):
        return []
    return [entry_id for entry_id in ids if isinstance(entry_id, str)]


def load_state() -> set:
    """Return the set of entry IDs recorded by previous runs."""
    return set(_read_seen_ids())


def save_state(seen: set):
    """Persist the seen IDs, keeping at most MAX_SEEN_IDS of the newest ones.

    A set has no order, so the order already stored in the state file is
    reused and IDs that are new in this run are appended after it; trimming
    then drops the oldest IDs rather than arbitrary ones.
    """
    previous = [i for i in dict.fromkeys(_read_seen_ids(warn=False)) if i in seen]
    known = set(previous)
    fresh = sorted((i for i in seen if i not in known), key=str)
    trimmed = (previous + fresh)[-MAX_SEEN_IDS:]
    payload = {
        "seen": trimmed,
        "updated": datetime.now(timezone.utc).isoformat(),
    }
    # Write to a sibling temp file then rename, so an interrupted run cannot
    # leave a half-written state file behind.
    tmp_path = STATE_FILE.with_name(STATE_FILE.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, indent=2)
    os.replace(tmp_path, STATE_FILE)


# ---------------------------------------------------------------------------
# Keyword scoring
# ---------------------------------------------------------------------------

def keyword_score(text: str, keywords: list[dict]) -> tuple[int, list[str]]:
    """Score a text by case-insensitive substring matching.

    Args:
        text: Text to scan (title + summary).
        keywords: Items shaped like {term: str, weight: int, role: str}.
            ``weight`` defaults to 1; items without a term are ignored.

    Returns:
        (score, matched_terms) where score is the sum of matched weights.
    """
    haystack = text.lower()
    score = 0
    matched = []
    for kw in keywords:
        term = str(kw.get("term") or "")
        if not term:
            # An empty term would match every text.
            continue
        if term.lower() in haystack:
            weight = kw.get("weight", 1)
            score += 1 if weight is None else weight
            matched.append(term)
    return score, matched


# ---------------------------------------------------------------------------
# Scoring through the Claude API
# ---------------------------------------------------------------------------

def _parse_ai_response(raw: str) -> tuple[int, str]:
    """Extract (score clamped to 0-10, one-line reason) from a model reply."""
    match = _JSON_OBJECT_RE.search(raw)
    if match is None:
        raise ValueError(f"no JSON object in reply: {raw[:80]!r}")
    result = json.loads(match.group(0))
    score = int(round(float(result["score"])))
    score = max(0, min(10, score))
    # Collapse whitespace so the reason fits on the note's single quote line.
    reason = " ".join(str(result.get("reason", "")).split())
    return score, reason


def ai_score(title: str, summary: str, roles: list[str], api_key: str) -> tuple[int, str]:
    """Ask Claude for a relevance score.

    Returns:
        (score 0-10, short justification), or (-1, "") when anything fails
        (missing package, API error, unparsable reply) so that the caller can
        fall back to keyword scoring.
    """
    try:
        import anthropic

        client = anthropic.Anthropic(api_key=api_key)
        roles_str = ", ".join(roles)
        prompt = f"""Tu es un assistant de veille pour un CTO/DSI français avec ces rôles : {roles_str}.

Article :
Titre : {title}
Résumé : {summary[:500]}

Donne un score de pertinence de 0 à 10 pour cet article, et une justification en une phrase.
Réponds UNIQUEMENT au format JSON : {{"score": <entier de 0 à 10>, "reason": "<justification>"}}"""
        msg = client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=100,
            messages=[{"role": "user", "content": prompt}],
        )
        return _parse_ai_response(msg.content[0].text)
    except Exception as e:
        # Deliberate best-effort: any failure degrades to keyword scoring.
        print(f" [AI score error] {e} — fallback keyword scoring")
        return -1, ""


# ---------------------------------------------------------------------------
# Obsidian note creation
# ---------------------------------------------------------------------------

def slugify(text: str) -> str:
    """Turn a title into a lowercase ASCII slug of at most 60 characters."""
    # NFKD splits accented letters into base letter + combining mark; dropping
    # the marks strips accents without a hand-written table.
    decomposed = unicodedata.normalize("NFKD", text.lower())
    stripped = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    stripped = re.sub(r"[^a-z0-9\s-]", "", stripped)
    stripped = re.sub(r"[\s-]+", "-", stripped.strip())
    # The cut at 60 characters may land on a hyphen; do not end the slug on it.
    return stripped[:60].strip("-")


def _yaml_quoted(value) -> str:
    """Render a value as a double-quoted YAML scalar (JSON strings are valid YAML)."""
    return json.dumps(str(value), ensure_ascii=False)


def _published_date(item: dict) -> str:
    """Return the entry's publication date as YYYY-MM-DD (today when unknown)."""
    parsed = item.get("published_parsed") or item.get("updated_parsed")
    if parsed:
        try:
            return datetime(*parsed[:3]).strftime("%Y-%m-%d")
        except (TypeError, ValueError):
            pass  # malformed date tuple: use today's date below
    return datetime.now().strftime("%Y-%m-%d")


def create_note(item: dict, feed_meta: dict, score: int, matched: list[str],
                reason: str, dry_run: bool):
    """Write one Markdown note for a feed entry into the Obsidian inbox.

    Args:
        item: Feed entry (reads title, link, summary, published_parsed).
        feed_meta: Feed configuration (reads name, category, role).
        score: Relevance score to record.
        matched: Keywords that matched (may be empty).
        reason: AI justification (may be empty).
        dry_run: When true, only print what would be created.

    Returns:
        True when a note was written (or would be, in dry-run mode), False
        when an existing note for the same article was left untouched.
    """
    title = " ".join(str(item.get("title") or "").split()) or "Sans titre"
    link = item.get("link", "")
    summary = item.get("summary", "")
    pub_date = _published_date(item)

    # Basic HTML clean-up: drop tags, then decode entities such as &amp;
    clean_summary = html.unescape(re.sub(r"<[^>]+>", "", summary)).strip()
    clean_summary = shorten(clean_summary, width=500, placeholder="…")

    date_str = datetime.now().strftime("%Y-%m-%d")
    # Titles with no ASCII letters or digits produce an empty slug.
    slug = slugify(title) or "sans-titre"
    filename = f"{date_str}-veille-{slug}.md"

    score_info = f"score: {score}"
    if matched:
        score_info += f" | mots-clés: {', '.join(matched)}"
    if reason:
        score_info += f" | {reason}"

    source_name = feed_meta.get("name", "")
    content = f"""---
title: {_yaml_quoted(title)}
date: {date_str}
published: {pub_date}
type: veille
source: {_yaml_quoted(source_name)}
categorie: {feed_meta.get('category', '')}
role: {feed_meta.get('role', '')}
url: {link}
score: {score}
statut: à traiter
tags: [veille, {feed_meta.get('category', 'divers')}]
---

# {title}

> {score_info}

{clean_summary}

→ [Lire l'article]({link})
"""

    if dry_run:
        print(f" [DRY-RUN] Note : {filename}")
        print(f" Score={score} | {', '.join(matched) if matched else reason}")
        return True

    PKM_INBOX.mkdir(parents=True, exist_ok=True)
    note_path = PKM_INBOX / filename

    # Never overwrite an existing note.
    if note_path.exists():
        existing = note_path.read_text(encoding="utf-8", errors="replace")
        if not link or link in existing:
            # Same article as the note already on disk (e.g. a same-day re-run).
            print(f" · Note déjà présente : 00-inbox/{filename}")
            return False
        # A different article whose title yields the same slug: add a short
        # hash of its link to the name instead of silently dropping it.
        digest = hashlib.sha1(link.encode("utf-8")).hexdigest()[:8]
        filename = f"{date_str}-veille-{slug}-{digest}.md"
        note_path = PKM_INBOX / filename
        if note_path.exists():
            print(f" · Note déjà présente : 00-inbox/{filename}")
            return False

    with open(note_path, "w", encoding="utf-8") as f:
        f.write(content)

    print(f" ✓ Note créée : 00-inbox/{filename}")
    return True


# ---------------------------------------------------------------------------
# Processing of one feed
# ---------------------------------------------------------------------------

def process_feed(feed_cfg: dict, keywords: list[dict], seen: set, threshold: int,
                 use_ai: bool, api_key: str, dry_run: bool) -> int:
    """Fetch one feed, score its unseen entries and create notes for the relevant ones.

    Args:
        feed_cfg: Feed configuration (url required; name, role, category and
            feed-specific keywords optional).
        keywords: Global keyword list.
        seen: IDs of already-processed entries; updated in place.
        threshold: Minimum score for an entry to become a note.
        use_ai: Score through the Claude API when an API key is available.
        api_key: Anthropic API key (may be empty).
        dry_run: Passed through to create_note.

    Returns:
        Number of notes created (or that would be created in dry-run mode).
    """
    url = feed_cfg["url"]
    name = feed_cfg.get("name", url)
    print(f"\n📡 {name}")

    parsed = feedparser.parse(url)
    if parsed.bozo and not parsed.entries:
        print(f" [ERREUR] Impossible de lire le flux : {parsed.bozo_exception}")
        return 0

    # Feed-specific keywords are added to the global ones. `or []` covers a
    # key that is present but left blank in the YAML (parsed as None).
    all_kws = list(keywords or []) + list(feed_cfg.get("keywords") or [])
    roles = [feed_cfg.get("role") or "général"]

    new_notes = 0
    for entry in parsed.entries:
        entry_id = entry.get("id") or entry.get("link") or entry.get("title", "")
        if entry_id in seen:
            continue

        title = entry.get("title", "")
        summary = entry.get("summary", "")
        text = f"{title} {summary}"

        score, matched, reason = -1, [], ""
        if use_ai and api_key:
            score, reason = ai_score(title, summary, roles, api_key)
        if score == -1:
            # Keyword mode, or fallback after an AI scoring failure.
            score, matched = keyword_score(text, all_kws)
            reason = ""

        if score >= threshold:
            if create_note(entry, feed_cfg, score, matched, reason, dry_run):
                new_notes += 1
        else:
            print(f" · Ignoré (score {score}) : {shorten(title, 70)}")

        # Mark as seen only once the entry has been fully handled, so that a
        # failure while writing the note does not lose the article for good.
        seen.add(entry_id)

    if new_notes == 0:
        print(" Aucun nouvel article retenu.")
    return new_notes


# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------

def main():
    """Command-line entry point: parse arguments, process feeds, save state."""
    parser = argparse.ArgumentParser(description="Pipeline veille RSS → Obsidian")
    parser.add_argument("--ai", action="store_true", help="Scoring via Claude API")
    parser.add_argument("--dry-run", action="store_true", help="Affiche sans créer de notes")
    parser.add_argument("--feed", metavar="URL", help="Teste un flux unique")
    args = parser.parse_args()

    config = load_config()
    seen = load_state()
    keywords = config.get("keywords") or []
    threshold = config.get("threshold", 2)
    api_key = config.get("anthropic_api_key", "") or os.environ.get("ANTHROPIC_API_KEY", "")

    if args.ai and not api_key:
        print("[AVERTISSEMENT] --ai activé mais ANTHROPIC_API_KEY non définie. Fallback keyword.")

    if args.feed:
        # Single feed given on the command line
        feeds = [{"url": args.feed, "name": args.feed, "category": "test", "role": "général"}]
    else:
        feeds = config.get("feeds") or []

    total = 0
    try:
        for feed_cfg in feeds:
            total += process_feed(feed_cfg, keywords, seen, threshold,
                                  args.ai, api_key, args.dry_run)
    finally:
        # Save even when a feed raises or the run is interrupted: notes that
        # were already written must not be created again on the next run.
        if not args.dry_run:
            save_state(seen)

    print(f"\n✅ Terminé — {total} note(s) créée(s) dans 00-inbox/")


if __name__ == "__main__":
    main()