#!/usr/bin/env python3
"""
veille-rss.py — Pipeline de veille RSS vers Obsidian PKM

Usage:
    python veille-rss.py                # mode keyword scoring (défaut)
    python veille-rss.py --ai           # scoring via Claude API
    python veille-rss.py --dry-run      # affiche sans créer de notes
    python veille-rss.py --feed URL     # teste un flux unique

Config : veille-feeds.yaml (dans le même dossier)
State : veille-state.json (créé automatiquement)

Dépendances :
    pip install feedparser pyyaml anthropic
"""
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
from datetime import datetime, timezone
|
|
from pathlib import Path
|
|
from textwrap import shorten
|
|
|
|
import feedparser
|
|
import yaml
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Chemins
|
|
# ---------------------------------------------------------------------------
|
|
# Directory containing this script; the config and state files sit next to it.
SCRIPT_DIR = Path(__file__).parents[0]
CONFIG_FILE = SCRIPT_DIR.joinpath("veille-feeds.yaml")
STATE_FILE = SCRIPT_DIR.joinpath("veille-state.json")

# Path to the Obsidian inbox, relative to the script
# (40-playbooks -> vault root -> 00-inbox).
PKM_INBOX = SCRIPT_DIR.parent.joinpath("00-inbox")
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Chargement config
|
|
# ---------------------------------------------------------------------------
|
|
def load_config() -> dict:
    """Load the pipeline configuration from ``CONFIG_FILE``.

    Returns:
        The parsed YAML content. An empty dict is returned when the file is
        empty or holds only comments, because ``yaml.safe_load`` yields
        ``None`` in that case and callers rely on ``.get()``.

    Exits the process with status 1 when the config file does not exist.
    """
    if not CONFIG_FILE.exists():
        print(f"[ERREUR] Config introuvable : {CONFIG_FILE}")
        sys.exit(1)
    with open(CONFIG_FILE, encoding="utf-8") as f:
        # safe_load returns None for an empty document; normalise to a dict.
        return yaml.safe_load(f) or {}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# State (articles déjà vus)
|
|
# ---------------------------------------------------------------------------
|
|
def load_state() -> set:
    """Return the IDs of the entries recorded by previous runs.

    The IDs are read from the ``"seen"`` list of ``STATE_FILE``. A missing
    file, or a file without that key, yields an empty set.
    """
    if not STATE_FILE.exists():
        return set()
    with open(STATE_FILE, encoding="utf-8") as handle:
        payload = json.load(handle)
    return set(payload.get("seen", []))
|
|
|
|
|
|
def save_state(seen: set):
    """Persist the seen-entry IDs to ``STATE_FILE``, capped at 5000 IDs.

    A ``set`` has no ordering, so slicing ``list(seen)`` would discard
    arbitrary IDs instead of the oldest ones. To keep the most recent IDs,
    the order already stored in the state file is reused and the IDs that
    are new in this run are appended after it before trimming.

    Args:
        seen: Every entry ID known after the current run.
    """
    max_ids = 5000  # cap to avoid unbounded growth of the state file

    # Recover the order written by the previous run; an unreadable or
    # missing file simply means there is no history to preserve.
    try:
        with open(STATE_FILE, encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError):
        data = {}
    previous = data.get("seen", []) if isinstance(data, dict) else []

    # dict.fromkeys de-duplicates while keeping first-seen order.
    ordered = dict.fromkeys(entry_id for entry_id in previous if entry_id in seen)
    # IDs new in this run go last (sorted only to make the output stable).
    ordered.update(dict.fromkeys(sorted(seen.difference(ordered), key=str)))

    trimmed = list(ordered)[-max_ids:]
    with open(STATE_FILE, "w", encoding="utf-8") as f:
        json.dump({"seen": trimmed, "updated": datetime.now().isoformat()}, f, indent=2)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Scoring par mots-clés
|
|
# ---------------------------------------------------------------------------
|
|
def keyword_score(text: str, keywords: list[dict]) -> tuple[int, list[str]]:
    """Score *text* against a list of weighted keywords.

    Each keyword is a mapping with a ``term`` and an optional ``weight``
    (default 1). A keyword counts once when its term occurs in *text* as a
    case-insensitive substring.

    Returns:
        ``(score, matched)``: the sum of the weights of the matching
        keywords and their terms as written in *keywords*, in input order.
    """
    haystack = text.lower()
    hits = [kw for kw in keywords if kw["term"].lower() in haystack]
    total = sum(kw.get("weight", 1) for kw in hits)
    return total, [kw["term"] for kw in hits]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Scoring via Claude API
|
|
# ---------------------------------------------------------------------------
|
|
def ai_score(title: str, summary: str, roles: list[str], api_key: str) -> tuple[int, str]:
    """Ask the Claude API how relevant an article is.

    Args:
        title: Article title.
        summary: Article summary; only the first 500 characters are sent.
        roles: Reader roles inserted into the prompt.
        api_key: Anthropic API key.

    Returns:
        ``(score, reason)`` with ``score`` clamped to 0-10 and a one-line
        justification. On any failure (missing ``anthropic`` package, API
        error, unparsable reply) returns ``(-1, "")`` so the caller can fall
        back to keyword scoring.
    """
    try:
        # Imported lazily: the dependency is only needed in --ai mode.
        import anthropic

        client = anthropic.Anthropic(api_key=api_key)
        roles_str = ", ".join(roles)
        prompt = f"""Tu es un assistant de veille pour un CTO/DSI français avec ces rôles : {roles_str}.

Article :
Titre : {title}
Résumé : {summary[:500]}

Donne un score de pertinence de 0 à 10 pour cet article, et une justification en une phrase.
Réponds UNIQUEMENT au format JSON : {{"score": <int>, "reason": "<string>"}}"""

        msg = client.messages.create(
            model="claude-haiku-4-5-20251001",
            max_tokens=100,
            messages=[{"role": "user", "content": prompt}],
        )
        return _parse_ai_reply(msg.content[0].text)
    except Exception as e:
        # Deliberate best-effort boundary: any failure triggers the fallback.
        print(f" [AI score error] {e} — fallback keyword scoring")
        return -1, ""


def _parse_ai_reply(raw: str) -> tuple[int, str]:
    """Extract ``(score, reason)`` from the model reply text."""
    # The model may wrap the JSON in code fences or add text around it,
    # so parse only the outermost {...} span.
    found = re.search(r"\{.*\}", raw, re.DOTALL)
    if found is None:
        raise ValueError("réponse sans objet JSON")
    result = json.loads(found.group(0))
    # Coerce and clamp so a stray "8" or 11 cannot skew threshold checks.
    score = max(0, min(10, int(result["score"])))
    # Collapse whitespace: the reason is rendered on a single Markdown line.
    reason = " ".join(str(result.get("reason", "")).split())
    return score, reason
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Création note Obsidian
|
|
# ---------------------------------------------------------------------------
|
|
def slugify(text: str) -> str:
    """Turn *text* into a lowercase ASCII slug of at most 60 characters.

    Accented letters are reduced to their base letter, every character other
    than ``a-z``, ``0-9``, whitespace and ``-`` is dropped, and runs of
    whitespace/hyphens become a single ``-``. The result never starts or
    ends with a hyphen and may be empty when nothing usable remains.
    """
    import unicodedata

    # NFKD splits an accented letter into base letter + combining mark;
    # dropping the non-ASCII code points then leaves only the base letter.
    text = unicodedata.normalize("NFKD", text.lower())
    text = text.encode("ascii", "ignore").decode("ascii")
    text = re.sub(r"[^a-z0-9\s-]", "", text)
    text = re.sub(r"[\s-]+", "-", text).strip("-")
    # Truncation can leave a dangling separator; remove it.
    return text[:60].rstrip("-")
|
|
|
|
|
|
def create_note(item: dict, feed_meta: dict, score: int, matched: list[str], reason: str, dry_run: bool) -> bool:
    """Write one Obsidian note for a retained feed entry into ``PKM_INBOX``.

    Args:
        item: Feed entry; its ``title``, ``link`` and ``summary`` are used.
        feed_meta: Feed configuration; ``name``, ``category`` and ``role``
            are copied into the front matter.
        score: Relevance score of the entry.
        matched: Keywords that matched (may be empty).
        reason: AI justification (may be empty).
        dry_run: When true, only print what would be written.

    Returns:
        ``True`` when the note was written (or would be, in dry-run mode),
        ``False`` when a note with the same filename already exists.
    """
    # Collapse whitespace so a multi-line title cannot break the heading.
    title = " ".join((item.get("title") or "").split()) or "Sans titre"
    link = item.get("link", "")
    summary = item.get("summary", "")

    # Basic HTML tag stripping, then cap the length of the excerpt.
    clean_summary = re.sub(r"<[^>]+>", "", summary).strip()
    clean_summary = shorten(clean_summary, width=500, placeholder="…")

    date_str = datetime.now().strftime("%Y-%m-%d")
    # Titles without any ASCII-representable character give an empty slug.
    slug = slugify(title) or "sans-titre"
    filename = f"{date_str}-veille-{slug}.md"

    score_info = f"score: {score}"
    if matched:
        score_info += f" | mots-clés: {', '.join(matched)}"
    if reason:
        score_info += f" | {reason}"

    # A JSON string literal is a valid YAML double-quoted scalar, so quotes
    # and backslashes in the title are escaped correctly.
    yaml_title = json.dumps(title, ensure_ascii=False)

    content = f"""---
title: {yaml_title}
date: {date_str}
type: veille
source: {feed_meta.get('name', '')}
categorie: {feed_meta.get('category', '')}
role: {feed_meta.get('role', '')}
url: {link}
score: {score}
statut: à traiter
tags: [veille, {feed_meta.get('category', 'divers')}]
---

# {title}

> {score_info}

{clean_summary}

→ [Lire l'article]({link})
"""

    if dry_run:
        print(f" [DRY-RUN] Note : {filename}")
        print(f" Score={score} | {', '.join(matched) if matched else reason}")
        return True

    PKM_INBOX.mkdir(parents=True, exist_ok=True)
    note_path = PKM_INBOX / filename
    try:
        # Mode "x" creates the file atomically and never overwrites an
        # existing note.
        with open(note_path, "x", encoding="utf-8") as f:
            f.write(content)
    except FileExistsError:
        return False
    print(f" ✓ Note créée : 00-inbox/{filename}")
    return True
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Traitement d'un flux
|
|
# ---------------------------------------------------------------------------
|
|
def process_feed(feed_cfg: dict, keywords: list[dict], seen: set,
                 threshold: int, use_ai: bool, api_key: str,
                 dry_run: bool) -> int:
    """Fetch one feed, score its unseen entries and create notes for the
    entries whose score reaches *threshold*.

    Args:
        feed_cfg: Feed configuration; ``url`` is required, ``name``,
            ``role`` and feed-specific ``keywords`` are optional.
        keywords: Global keywords, applied to every feed.
        seen: IDs of already handled entries; updated in place with every
            entry examined here, retained or not.
        threshold: Minimum score for an entry to become a note.
        use_ai: Score with the Claude API when an ``api_key`` is available.
        api_key: Anthropic API key (may be empty).
        dry_run: Passed through to ``create_note``.

    Returns:
        The number of notes created for this feed.
    """
    url = feed_cfg["url"]
    name = feed_cfg.get("name", url)
    print(f"\n📡 {name}")

    parsed = feedparser.parse(url)
    if parsed.bozo and not parsed.entries:
        print(f" [ERREUR] Impossible de lire le flux : {parsed.bozo_exception}")
        return 0

    # Feed-specific keywords come on top of the global ones. An empty YAML
    # key loads as None, hence the `or []` guards.
    all_kws = (keywords or []) + (feed_cfg.get("keywords") or [])

    roles = [feed_cfg.get("role", "général")]
    new_notes = 0

    for entry in parsed.entries:
        entry_id = entry.get("id") or entry.get("link") or entry.get("title", "")
        if entry_id in seen:
            continue
        seen.add(entry_id)

        title = entry.get("title", "")
        summary = entry.get("summary", "")
        text = f"{title} {summary}"

        score, matched, reason = -1, [], ""
        if use_ai and api_key:
            score, reason = ai_score(title, summary, roles, api_key)
        if score == -1:
            # Keyword mode, or fallback after an AI scoring failure.
            score, matched = keyword_score(text, all_kws)
            reason = ""

        if score < threshold:
            print(f" · Ignoré (score {score}) : {shorten(title, 70)}")
            continue

        # An explicit False means the note was skipped because the file
        # already exists; any other return value counts as created.
        if create_note(entry, feed_cfg, score, matched, reason, dry_run) is not False:
            new_notes += 1

    if new_notes == 0:
        print(" Aucun nouvel article retenu.")
    return new_notes
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main
|
|
# ---------------------------------------------------------------------------
|
|
def main():
    """Command-line entry point: parse arguments, process the configured
    feeds (or the single ``--feed`` URL) and save the seen-entry state.

    The state is not written in ``--dry-run`` mode.
    """
    parser = argparse.ArgumentParser(description="Pipeline veille RSS → Obsidian")
    parser.add_argument("--ai", action="store_true", help="Scoring via Claude API")
    parser.add_argument("--dry-run", action="store_true", help="Affiche sans créer de notes")
    parser.add_argument("--feed", metavar="URL", help="Teste un flux unique")
    args = parser.parse_args()

    # An empty YAML file or an empty key loads as None; normalise each value
    # so the code below can iterate and compare safely.
    config = load_config() or {}
    seen = load_state()
    keywords = config.get("keywords") or []
    feeds = config.get("feeds") or []
    threshold = config.get("threshold")
    if threshold is None:
        threshold = 2
    api_key = config.get("anthropic_api_key", "") or os.environ.get("ANTHROPIC_API_KEY", "")

    if args.ai and not api_key:
        print("[AVERTISSEMENT] --ai activé mais ANTHROPIC_API_KEY non définie. Fallback keyword.")

    if args.feed:
        # Single feed given on the command line.
        feeds = [{"url": args.feed, "name": args.feed, "category": "test", "role": "général"}]

    total = 0
    for feed_cfg in feeds:
        total += process_feed(feed_cfg, keywords, seen, threshold, args.ai, api_key, args.dry_run)

    if not args.dry_run:
        save_state(seen)

    print(f"\n✅ Terminé — {total} note(s) créée(s) dans 00-inbox/")
|
|
|
|
|
|
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    sys.exit(main())
|