pkm/40-playbooks/veille-rss.py
Philippe e3223ef191 S24
2026-06-10 23:15:41 +02:00

292 lines
9.7 KiB
Python

#!/usr/bin/env python3
"""
veille-rss.py — Pipeline de veille RSS vers Obsidian PKM
Usage:
python veille-rss.py # mode keyword scoring (défaut)
python veille-rss.py --ai # scoring via Claude API
python veille-rss.py --dry-run # affiche sans créer de notes
python veille-rss.py --feed URL # teste un flux unique
Config : veille-feeds.yaml (dans le même dossier)
State : veille-state.json (créé automatiquement)
Dépendances :
pip install feedparser pyyaml anthropic
"""
import argparse
import json
import os
import re
import sys
from datetime import datetime, timezone
from pathlib import Path
from textwrap import shorten
import feedparser
import yaml
# ---------------------------------------------------------------------------
# Chemins
# ---------------------------------------------------------------------------
# Directory holding this script; the config and state files sit beside it.
SCRIPT_DIR = Path(__file__).parent
CONFIG_FILE = SCRIPT_DIR.joinpath("veille-feeds.yaml")
STATE_FILE = SCRIPT_DIR.joinpath("veille-state.json")
# Obsidian inbox, resolved relative to the script: one level up, then 00-inbox.
PKM_INBOX = SCRIPT_DIR.parent.joinpath("00-inbox")
# ---------------------------------------------------------------------------
# Chargement config
# ---------------------------------------------------------------------------
def load_config() -> dict:
    """Load the YAML configuration stored in ``CONFIG_FILE``.

    Returns:
        The parsed document. An empty (or comment-only) file yields ``{}``
        instead of ``None`` so callers can safely use ``.get``.

    Exits the process with status 1 when the file does not exist.
    """
    if not CONFIG_FILE.exists():
        print(f"[ERREUR] Config introuvable : {CONFIG_FILE}")
        sys.exit(1)
    with open(CONFIG_FILE, encoding="utf-8") as f:
        # yaml.safe_load returns None for an empty document.
        return yaml.safe_load(f) or {}
# ---------------------------------------------------------------------------
# State (articles déjà vus)
# ---------------------------------------------------------------------------
def load_state() -> set:
    """Return the entry IDs recorded in ``STATE_FILE`` by earlier runs.

    An absent state file yields an empty set.
    """
    if not STATE_FILE.exists():
        return set()
    with open(STATE_FILE, encoding="utf-8") as handle:
        payload = json.load(handle)
    known_ids = payload.get("seen", [])
    return set(known_ids)
def save_state(seen: set):
    """Persist the seen-entry IDs to ``STATE_FILE``, keeping at most 5000.

    A set has no defined order, so slicing it directly would discard
    arbitrary IDs. The order stored by the previous run is reused and the
    IDs that are not yet in the file are appended after it, so the cap
    drops the oldest stored entries first. IDs added during the same run
    have no defined relative order.

    Args:
        seen: IDs of every entry known so far (previous runs + this run).
    """
    try:
        with open(STATE_FILE, encoding="utf-8") as f:
            previous = json.load(f).get("seen", [])
    except (OSError, ValueError, AttributeError):
        # No usable earlier state: every ID is treated as new.
        previous = []
    if not isinstance(previous, list):
        previous = []
    # dict.fromkeys de-duplicates while preserving the stored order.
    ordered = [entry_id for entry_id in dict.fromkeys(previous) if entry_id in seen]
    already_stored = set(ordered)
    ordered.extend(entry_id for entry_id in seen if entry_id not in already_stored)
    # Cap the list to avoid unbounded growth of the state file.
    trimmed = ordered[-5000:]
    with open(STATE_FILE, "w", encoding="utf-8") as f:
        json.dump({"seen": trimmed, "updated": datetime.now().isoformat()}, f, indent=2)
# ---------------------------------------------------------------------------
# Scoring par mots-clés
# ---------------------------------------------------------------------------
def keyword_score(text: str, keywords: list[dict]) -> tuple[int, list[str]]:
    """Score *text* against a list of weighted keywords.

    Matching is a case-insensitive substring test. Each keyword dict must
    provide ``term``; ``weight`` is optional and defaults to 1.

    Returns:
        ``(score, matched)`` where ``score`` is the sum of the weights of the
        keywords found and ``matched`` lists their ``term`` values as
        written in the configuration, in input order.
    """
    haystack = text.lower()
    hits = [kw for kw in keywords if kw["term"].lower() in haystack]
    total = sum(kw.get("weight", 1) for kw in hits)
    return total, [kw["term"] for kw in hits]
# ---------------------------------------------------------------------------
# Scoring via Claude API
# ---------------------------------------------------------------------------
def ai_score(title: str, summary: str, roles: list[str], api_key: str) -> tuple[int, str]:
    """Ask the Claude API how relevant an article is for the given roles.

    Args:
        title: Article title.
        summary: Article summary; only its first 500 characters are sent.
        roles: Role labels inserted into the prompt.
        api_key: Anthropic API key.

    Returns:
        ``(score, reason)`` with ``score`` clamped to 0-10, or ``(-1, "")``
        when anything fails (missing package, API error, unparsable reply).
        Callers treat -1 as the signal to fall back to keyword scoring.
    """
    try:
        import anthropic

        client = anthropic.Anthropic(api_key=api_key)
        roles_str = ", ".join(roles)
        prompt = f"""Tu es un assistant de veille pour un CTO/DSI français avec ces rôles : {roles_str}.
Article :
Titre : {title}
Résumé : {summary[:500]}
Donne un score de pertinence de 0 à 10 pour cet article, et une justification en une phrase.
Réponds UNIQUEMENT au format JSON : {{"score": <int>, "reason": "<string>"}}"""
        msg = client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=100,
            messages=[{"role": "user", "content": prompt}],
        )
        raw = msg.content[0].text
        # The reply may be wrapped in a Markdown fence or surrounded by prose;
        # keep only the outermost {...} span before decoding.
        found = re.search(r"\{.*\}", raw, re.DOTALL)
        result = json.loads(found.group(0) if found else raw)
        # Coerce "8" / 8.0 to int and keep the value inside the documented range.
        score = max(0, min(10, int(result["score"])))
        return score, str(result.get("reason", ""))
    except Exception as e:
        # Deliberate best-effort: any failure degrades to keyword scoring.
        print(f" [AI score error] {e} — fallback keyword scoring")
        return -1, ""
# ---------------------------------------------------------------------------
# Création note Obsidian
# ---------------------------------------------------------------------------
def slugify(text: str) -> str:
    """Turn *text* into a lowercase ASCII slug of at most 60 characters.

    Accented letters are folded to their base letter through Unicode
    canonical decomposition, every character other than ``a-z``, ``0-9``,
    whitespace and ``-`` is removed, and whitespace runs become a single
    hyphen. The result may be empty when *text* has no usable character.
    """
    import unicodedata

    # NFD splits "é" into "e" + combining accent; dropping the combining
    # marks folds any accented Latin letter, not just a fixed list.
    decomposed = unicodedata.normalize("NFD", text.lower())
    text = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    text = re.sub(r"[^a-z0-9\s-]", "", text)
    text = re.sub(r"\s+", "-", text.strip())
    # Truncation can cut right after a separator; do not leave it dangling.
    return text[:60].rstrip("-")
def create_note(item: dict, feed_meta: dict, score: int, matched: list[str], reason: str, dry_run: bool):
    """Write one Obsidian note for a retained feed entry into ``PKM_INBOX``.

    The note is a Markdown file with YAML front matter, named
    ``<run date>-veille-<slug of title>.md``. A file that already exists
    under that name is left untouched.

    Args:
        item: Feed entry; ``title``, ``link`` and ``summary`` are read.
        feed_meta: Feed configuration; ``name``, ``category`` and ``role``
            are read.
        score: Relevance score stored in the note.
        matched: Keywords that matched (may be empty).
        reason: Justification text (may be empty).
        dry_run: When true, only print what would be written.
    """
    title = item.get("title", "Sans titre")
    link = item.get("link", "")
    summary = item.get("summary", "")
    # Basic HTML clean-up: drop tags, then cap the text at 500 characters.
    clean_summary = re.sub(r"<[^>]+>", "", summary).strip()
    clean_summary = shorten(clean_summary, width=500, placeholder="")
    date_str = datetime.now().strftime("%Y-%m-%d")
    slug = slugify(title)
    filename = f"{date_str}-veille-{slug}.md"
    score_info = f"score: {score}"
    if matched:
        score_info += f" | mots-clés: {', '.join(matched)}"
    if reason:
        score_info += f" | {reason}"
    # Feed titles are untrusted and often contain quotes or backslashes.
    # json.dumps yields a double-quoted, escaped string, which is also a
    # valid YAML double-quoted scalar; plain titles come out unchanged.
    yaml_title = json.dumps(title, ensure_ascii=False)
    content = f"""---
title: {yaml_title}
date: {date_str}
type: veille
source: {feed_meta.get('name', '')}
categorie: {feed_meta.get('category', '')}
role: {feed_meta.get('role', '')}
url: {link}
score: {score}
statut: à traiter
tags: [veille, {feed_meta.get('category', 'divers')}]
---
# {title}
> {score_info}
{clean_summary}
→ [Lire l'article]({link})
"""
    if dry_run:
        print(f" [DRY-RUN] Note : {filename}")
        print(f" Score={score} | {', '.join(matched) if matched else reason}")
        return
    PKM_INBOX.mkdir(parents=True, exist_ok=True)
    note_path = PKM_INBOX / filename
    try:
        # Mode "x" fails if the file exists, so an existing note is never
        # overwritten, without a separate check-then-write step.
        with open(note_path, "x", encoding="utf-8") as f:
            f.write(content)
    except FileExistsError:
        return
    print(f" ✓ Note créée : 00-inbox/{filename}")
# ---------------------------------------------------------------------------
# Traitement d'un flux
# ---------------------------------------------------------------------------
def process_feed(feed_cfg: dict, keywords: list[dict], seen: set,
                 threshold: int, use_ai: bool, api_key: str,
                 dry_run: bool) -> int:
    """Fetch one feed, score its unseen entries and create notes for the relevant ones.

    Args:
        feed_cfg: Feed configuration; ``url`` is required, ``name``,
            ``role`` and ``keywords`` are optional.
        keywords: Global keyword list, combined with the feed's own list.
        seen: IDs of entries already processed; updated in place with every
            entry examined here, whether or not it is retained.
        threshold: Minimum score for an entry to be retained.
        use_ai: Score through ``ai_score`` when an API key is available.
        api_key: Anthropic API key (may be empty).
        dry_run: Forwarded to ``create_note``.

    Returns:
        The number of entries whose score reached *threshold*.
    """
    url = feed_cfg["url"]
    name = feed_cfg.get("name", url)
    print(f"\n📡 {name}")
    parsed = feedparser.parse(url)
    if parsed.bozo and not parsed.entries:
        print(f" [ERREUR] Impossible de lire le flux : {parsed.bozo_exception}")
        return 0
    # Feed-specific keywords are added to the global ones. A YAML key left
    # without a value loads as None, so both sides are normalised to a list.
    all_kws = (keywords or []) + (feed_cfg.get("keywords") or [])
    roles = [feed_cfg.get("role", "général")]
    new_notes = 0
    for entry in parsed.entries:
        entry_id = entry.get("id") or entry.get("link") or entry.get("title", "")
        if entry_id in seen:
            continue
        seen.add(entry_id)
        title = entry.get("title", "")
        summary = entry.get("summary", "")
        text = f"{title} {summary}"
        if use_ai and api_key:
            score, reason = ai_score(title, summary, roles, api_key)
            matched = []
            if score == -1:  # ai_score failed: fall back to keyword scoring
                score, matched = keyword_score(text, all_kws)
                reason = ""
        else:
            score, matched = keyword_score(text, all_kws)
            reason = ""
        if score >= threshold:
            create_note(entry, feed_cfg, score, matched, reason, dry_run)
            new_notes += 1
        else:
            print(f" · Ignoré (score {score}) : {shorten(title, 70)}")
    if new_notes == 0:
        print(" Aucun nouvel article retenu.")
    return new_notes
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main():
    """Command-line entry point.

    Parses the options, loads the configuration and the seen-entry state,
    processes either the single feed given with ``--feed`` or every feed of
    the configuration, then saves the state unless ``--dry-run`` is set.
    """
    cli = argparse.ArgumentParser(description="Pipeline veille RSS → Obsidian")
    cli.add_argument("--ai", action="store_true", help="Scoring via Claude API")
    cli.add_argument("--dry-run", action="store_true", help="Affiche sans créer de notes")
    cli.add_argument("--feed", metavar="URL", help="Teste un flux unique")
    args = cli.parse_args()

    config = load_config()
    seen = load_state()
    keywords = config.get("keywords", [])
    threshold = config.get("threshold", 2)
    # The key from the config file wins over the environment variable.
    api_key = config.get("anthropic_api_key", "") or os.environ.get("ANTHROPIC_API_KEY", "")
    if args.ai and not api_key:
        print("[AVERTISSEMENT] --ai activé mais ANTHROPIC_API_KEY non définie. Fallback keyword.")

    if args.feed:
        # Single feed passed on the command line.
        feeds = [{"url": args.feed, "name": args.feed, "category": "test", "role": "général"}]
    else:
        feeds = config.get("feeds", [])

    total = 0
    for feed_cfg in feeds:
        total += process_feed(feed_cfg, keywords, seen, threshold, args.ai, api_key, args.dry_run)

    if not args.dry_run:
        save_state(seen)
    print(f"\n✅ Terminé — {total} note(s) créée(s) dans 00-inbox/")


if __name__ == "__main__":
    main()