# Lycostorrent/app/rss_source.py
# Last updated: 2026-03-23 20:59:26 +01:00
"""
RSS Source - Parser générique pour flux RSS de trackers torrent
Permet d'ajouter n'importe quel tracker qui fournit un flux RSS
"""
import json
import logging
import os
import re
import uuid
import xml.etree.ElementTree as ET
from datetime import datetime
from urllib.parse import urlparse, parse_qs, urlencode, urlunparse

import requests
logger = logging.getLogger(__name__)
class RSSSource:
    """Generic client for torrent-tracker RSS feeds.

    Downloads a feed URL (directly, or through Flaresolverr for
    Cloudflare-protected trackers) and parses RSS 2.0 / Atom items into
    Jackett/Prowlarr-compatible result dictionaries.
    """

    # URL schemes accepted by _validate_url (basic SSRF mitigation).
    ALLOWED_PROTOCOLS = ['http', 'https']

    def __init__(self, flaresolverr_url=None):
        """Create the HTTP session.

        Args:
            flaresolverr_url: base URL of a Flaresolverr instance; falls
                back to the FLARESOLVERR_URL environment variable.
        """
        self.session = requests.Session()
        self.flaresolverr_url = flaresolverr_url or os.getenv('FLARESOLVERR_URL', '')
        # Realistic browser headers to avoid naive anti-bot blocking.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'application/rss+xml, application/xml, text/xml, */*',
            'Accept-Language': 'fr-FR,fr;q=0.9,en-US;q=0.8,en;q=0.7',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Cache-Control': 'no-cache'
        })
        if self.flaresolverr_url:
            logger.info(f"✅ Flaresolverr configuré: {self.flaresolverr_url}")

    def _validate_url(self, url):
        """Return True when *url* is safe to fetch.

        Rejects non-HTTP(S) schemes, URLs without a host, and loopback /
        wildcard hosts (SSRF protection).
        """
        try:
            parsed = urlparse(url)
            if parsed.scheme not in self.ALLOWED_PROTOCOLS:
                logger.warning(f"⚠️ Protocole non autorisé: {parsed.scheme}")
                return False
            if not parsed.netloc:
                logger.warning("⚠️ URL sans domaine")
                return False
            # BUGFIX: compare .hostname rather than .netloc, so that a URL
            # with a port such as "http://localhost:8080/" is also blocked.
            if (parsed.hostname or '') in ('localhost', '127.0.0.1', '0.0.0.0'):
                logger.warning("⚠️ URL locale bloquée")
                return False
            return True
        except Exception as e:
            logger.warning(f"⚠️ URL invalide: {e}")
            return False

    def fetch_feed(self, feed_config, max_results=50):
        """Fetch and parse one RSS feed.

        Args:
            feed_config: dict with keys {url, name, category, passkey,
                use_flaresolverr, cookies}.
            max_results: maximum number of results to return.

        Returns:
            List of result dicts formatted like Jackett/Prowlarr output;
            empty list on any error (errors are logged, never raised).
        """
        try:
            url = feed_config.get('url', '')
            name = feed_config.get('name', 'RSS')
            passkey = feed_config.get('passkey', '')
            use_flaresolverr = feed_config.get('use_flaresolverr', False)
            cookies = feed_config.get('cookies', '')
            # Substitute the passkey placeholder if the URL template has one.
            if passkey and '{passkey}' in url:
                url = url.replace('{passkey}', passkey)
            if not self._validate_url(url):
                logger.error(f"❌ RSS {name}: URL invalide ou non autorisée")
                return []
            logger.info(f"🔗 RSS {name}: Fetching {self._mask_url(url)}")
            if use_flaresolverr and self.flaresolverr_url:
                # Route through Flaresolverr to bypass Cloudflare.
                content = self._fetch_with_flaresolverr(url, cookies)
                if content is None:
                    return []
            else:
                # Direct request, forwarding cookies when provided.
                if cookies:
                    cookie_dict = self._parse_cookies(cookies)
                    response = self.session.get(url, timeout=30, cookies=cookie_dict)
                else:
                    response = self.session.get(url, timeout=30)
                response.raise_for_status()
                content = response.content
            results = self._parse_rss(content, name)
            logger.info(f"📦 RSS {name}: {len(results)} résultats")
            return results[:max_results]
        except requests.exceptions.Timeout:
            logger.error(f"⏱️ RSS {feed_config.get('name', 'Unknown')}: Timeout")
            return []
        except requests.exceptions.RequestException as e:
            logger.error(f"❌ RSS {feed_config.get('name', 'Unknown')}: Erreur connexion - {e}")
            return []
        except Exception as e:
            logger.error(f"❌ RSS {feed_config.get('name', 'Unknown')}: Erreur - {e}", exc_info=True)
            return []

    def _parse_cookies(self, cookies_str):
        """Parse a "name1=value1; name2=value2" cookie string into a dict."""
        cookie_dict = {}
        if not cookies_str:
            return cookie_dict
        for part in cookies_str.split(';'):
            part = part.strip()
            if '=' in part:
                key, value = part.split('=', 1)
                cookie_dict[key.strip()] = value.strip()
        return cookie_dict

    def _fetch_with_flaresolverr(self, url, cookies=''):
        """Fetch *url* through Flaresolverr (Cloudflare bypass).

        Returns the response body as UTF-8 bytes, or None on failure.
        """
        try:
            logger.info(f"🛡️ Utilisation de Flaresolverr pour: {self._mask_url(url)}")
            # Persistent session so Cloudflare clearance cookies survive
            # between requests.
            payload = {
                "cmd": "request.get",
                "url": url,
                "session": "lycostorrent_session",
                "maxTimeout": 60000
            }
            if cookies:
                # Reuse _parse_cookies instead of re-parsing the string here
                # (the two code paths previously duplicated the logic).
                domain = self._extract_domain(url)
                cookie_list = [
                    {"name": name, "value": value, "domain": domain}
                    for name, value in self._parse_cookies(cookies).items()
                ]
                if cookie_list:
                    payload["cookies"] = cookie_list
                    logger.info(f"🍪 {len(cookie_list)} cookies ajoutés à la requête")
            response = requests.post(
                f"{self.flaresolverr_url}/v1",
                json=payload,
                timeout=65  # slightly above Flaresolverr's own 60 s budget
            )
            response.raise_for_status()
            data = response.json()
            if data.get('status') != 'ok':
                logger.error(f"❌ Flaresolverr error: {data.get('message', 'Unknown error')}")
                return None
            solution = data.get('solution', {})
            html_content = solution.get('response', '')
            logger.info(f"✅ Flaresolverr: succès (status {solution.get('status', 'N/A')})")
            # XML feed detected: hand it straight to the parser.
            if html_content.strip().startswith('<?xml') or '<rss' in html_content[:500]:
                logger.info("📄 Contenu XML détecté")
                return html_content.encode('utf-8')
            # Not XML — most likely an HTML login page.
            if 'login' in html_content.lower() or 'connexion' in html_content.lower():
                logger.warning("⚠️ Page de connexion détectée - vérifiez vos cookies")
            else:
                logger.warning(f"⚠️ Le contenu ne semble pas être du XML RSS")
            logger.debug(f"📄 Début du contenu: {html_content[:500]}")
            # Return it anyway so the XML parser can have a go.
            return html_content.encode('utf-8')
        except requests.exceptions.Timeout:
            logger.error("⏱️ Flaresolverr timeout")
            return None
        except Exception as e:
            logger.error(f"❌ Flaresolverr error: {e}")
            return None

    def _extract_domain(self, url):
        """Return the URL's host as ".domain" so cookies match subdomains."""
        try:
            domain = urlparse(url).netloc
            if domain.startswith('www.'):
                domain = domain[4:]
            return f".{domain}"
        except Exception:  # was a bare except; narrowed without changing behavior
            # Historical fallback kept for backward compatibility.
            return ".yggtorrent.org"

    @staticmethod
    def _to_int(value):
        """Best-effort int conversion; returns 0 for None/invalid input."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return 0

    def _parse_rss(self, content, source_name):
        """Parse raw feed XML into a list of result dicts (RSS 2.0 or Atom)."""
        results = []
        try:
            root = ET.fromstring(content)
            items = root.findall('.//item')
            if not items:
                # Fall back to the Atom format.
                ns = {'atom': 'http://www.w3.org/2005/Atom'}
                items = root.findall('.//atom:entry', ns)
            for item in items:
                result = self._parse_item(item, source_name)
                if result:
                    results.append(result)
        except ET.ParseError as e:
            logger.error(f"❌ Erreur parsing XML: {e}")
        return results

    def _parse_item(self, item, source_name):
        """Parse a single RSS/Atom item into a result dict (None if unusable)."""
        try:
            # Common feed namespaces.
            ns = {
                'torrent': 'http://xmlns.ezrss.it/0.1/',
                'atom': 'http://www.w3.org/2005/Atom',
                'newznab': 'http://www.newznab.com/DTD/2010/feeds/attributes/'
            }
            title = self._get_text(item, 'title') or ''
            if not title:
                return None
            # Download link: prefer the <enclosure> URL when present.
            link = self._get_text(item, 'link') or ''
            enclosure = item.find('enclosure')
            if enclosure is not None:
                link = enclosure.get('url', link)
            # Magnet URI (ezrss extension, or the link itself).
            magnet = ''
            magnet_el = item.find('.//torrent:magnetURI', ns)
            if magnet_el is not None and magnet_el.text:
                magnet = magnet_el.text
            elif link.startswith('magnet:'):
                magnet = link
                link = ''
            # Size: try torrent:contentLength, then the enclosure length,
            # then a newznab:attr entry.
            size = 0
            size_el = item.find('.//torrent:contentLength', ns)
            if size_el is not None and size_el.text:
                size = self._to_int(size_el.text)
            if size == 0 and enclosure is not None:
                size = self._to_int(enclosure.get('length', 0))
            if size == 0:
                for attr in item.findall('.//newznab:attr', ns):
                    if attr.get('name') == 'size':
                        size = self._to_int(attr.get('value', 0))
                        break
            # Seeders / leechers, when the feed exposes them.
            seeders = 0
            seeders_el = item.find('.//torrent:seeds', ns)
            if seeders_el is not None and seeders_el.text:
                seeders = self._to_int(seeders_el.text)
            leechers = 0
            leechers_el = item.find('.//torrent:peers', ns)
            if leechers_el is not None and leechers_el.text:
                leechers = self._to_int(leechers_el.text)
            # Publication date, both display and sortable forms.
            pub_date = self._get_text(item, 'pubDate') or ''
            pub_date_formatted = self._format_date(pub_date)
            pub_date_iso = self._parse_date_to_iso(pub_date)
            # Details page: guid or link, but never a magnet URI.
            details = self._get_text(item, 'guid') or self._get_text(item, 'link') or ''
            if details.startswith('magnet:'):
                details = ''
            category = self._get_text(item, 'category') or ''
            return {
                'Title': title,
                'Link': link if not link.startswith('magnet:') else '',
                'MagnetUri': magnet,
                'Size': size,
                'SizeFormatted': self._format_size(size),
                'Seeders': seeders,
                'Peers': leechers,
                'PublishDate': pub_date_formatted,
                'PublishDateRaw': pub_date_iso or pub_date,
                'Tracker': source_name,
                'Details': details,
                'Category': category,
                'Source': 'rss'
            }
        except Exception as e:
            logger.warning(f"⚠️ Erreur parsing item RSS: {e}")
            return None

    def _get_text(self, element, tag):
        """Return the stripped text of child *tag*, or None if absent/empty."""
        el = element.find(tag)
        if el is not None and el.text:
            return el.text.strip()
        return None

    def _format_date(self, date_str):
        """Format an RSS date as 'DD/MM/YYYY HH:MM' for display.

        Falls back to (a truncation of) the raw string when no known
        format matches.
        """
        if not date_str:
            return ''
        try:
            # Standard RSS format: "Tue, 24 Dec 2025 10:30:00 +0000"
            for fmt in [
                '%a, %d %b %Y %H:%M:%S %z',
                '%a, %d %b %Y %H:%M:%S %Z',
                '%Y-%m-%dT%H:%M:%S%z',
                '%Y-%m-%dT%H:%M:%SZ',
                '%Y-%m-%d %H:%M:%S',
            ]:
                try:
                    dt = datetime.strptime(date_str.strip(), fmt)
                    return dt.strftime('%d/%m/%Y %H:%M')
                except ValueError:
                    continue
            # No format matched: return the raw value (truncated).
            return date_str[:16] if len(date_str) > 16 else date_str
        except Exception:
            return date_str

    def _parse_date_to_iso(self, date_str):
        """Convert an RSS date to a sortable ISO string ('' if unparseable)."""
        if not date_str:
            return ''
        try:
            for fmt in [
                '%a, %d %b %Y %H:%M:%S %z',
                '%a, %d %b %Y %H:%M:%S %Z',
                '%a, %d %b %Y %H:%M:%S',
                '%Y-%m-%dT%H:%M:%S%z',
                '%Y-%m-%dT%H:%M:%SZ',
                '%Y-%m-%d %H:%M:%S',
                '%d/%m/%Y %H:%M:%S',
                '%d/%m/%Y %H:%M',
            ]:
                try:
                    dt = datetime.strptime(date_str.strip(), fmt)
                    # Lexicographically sortable ISO form.
                    return dt.strftime('%Y-%m-%dT%H:%M:%S')
                except ValueError:
                    continue
            return ''
        except Exception:
            return ''

    def _format_size(self, size_bytes):
        """Format a byte count as a human-readable string ('N/A' for 0/None)."""
        if not size_bytes or size_bytes == 0:
            return 'N/A'
        for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
            if size_bytes < 1024:
                return f"{size_bytes:.1f} {unit}"
            size_bytes /= 1024
        return f"{size_bytes:.1f} PB"

    def _mask_url(self, url):
        """Redact sensitive query parameters (passkey, tokens) for logging."""
        try:
            parsed = urlparse(url)
            query = parse_qs(parsed.query)
            sensitive_keys = ['passkey', 'apikey', 'api_key', 'key', 'token', 'auth']
            for key in sensitive_keys:
                if key in query:
                    query[key] = ['***']
            new_query = urlencode(query, doseq=True)
            return urlunparse((parsed.scheme, parsed.netloc, parsed.path,
                               parsed.params, new_query, parsed.fragment))
        except Exception:
            return url[:50] + '...'
class RSSManager:
    """Manages the set of configured RSS feeds (load/save, CRUD, fetch)."""

    def __init__(self, config_path='/app/config/rss_feeds.json'):
        """Load feed definitions from the JSON file at *config_path*."""
        self.config_path = config_path
        self.rss_source = RSSSource()
        self.feeds = []
        self.load_config()

    def load_config(self):
        """Load the feed configuration from disk; empty list on error."""
        try:
            if os.path.exists(self.config_path):
                with open(self.config_path, 'r') as f:
                    data = json.load(f)
                self.feeds = data.get('feeds', [])
                logger.info(f"{len(self.feeds)} flux RSS configurés")
            else:
                self.feeds = []
                logger.info("📝 Aucun flux RSS configuré")
        except Exception as e:
            logger.error(f"❌ Erreur chargement config RSS: {e}")
            self.feeds = []

    def save_config(self):
        """Persist the feed configuration to disk; return True on success."""
        try:
            directory = os.path.dirname(self.config_path)
            # BUGFIX: os.makedirs('') raises FileNotFoundError when the
            # config path has no directory component (bare filename).
            if directory:
                os.makedirs(directory, exist_ok=True)
            with open(self.config_path, 'w') as f:
                json.dump({'feeds': self.feeds}, f, indent=2)
            logger.info(f"✅ Configuration RSS sauvegardée")
            return True
        except Exception as e:
            logger.error(f"❌ Erreur sauvegarde config RSS: {e}")
            return False

    def get_feeds(self, category=None):
        """Return feeds, optionally filtered by category.

        When *category* is given, feeds with NO category are also included
        (they are treated as applying to every category).
        """
        if category:
            return [f for f in self.feeds if f.get('category') == category or not f.get('category')]
        return self.feeds

    def get_feeds_for_latest(self, category):
        """Return enabled feeds whose category is *category* or 'all'."""
        matching = []
        for feed in self.feeds:
            if feed.get('enabled', True):
                feed_cat = feed.get('category', '')
                if feed_cat == category or feed_cat == 'all':
                    matching.append(feed)
        return matching

    def add_feed(self, feed):
        """Add a new feed, assign it a short unique id, and persist."""
        feed['id'] = str(uuid.uuid4())[:8]
        feed['enabled'] = feed.get('enabled', True)
        self.feeds.append(feed)
        self.save_config()
        return feed

    def update_feed(self, feed_id, updates):
        """Apply *updates* to the feed with *feed_id*; None if not found."""
        for feed in self.feeds:
            if feed.get('id') == feed_id:
                feed.update(updates)
                self.save_config()
                return feed
        return None

    def delete_feed(self, feed_id):
        """Remove the feed with *feed_id* (no-op if absent) and persist."""
        self.feeds = [f for f in self.feeds if f.get('id') != feed_id]
        self.save_config()
        return True

    def test_feed(self, url, passkey='', use_flaresolverr=False, cookies=''):
        """Fetch a feed once and return a small preview for the UI.

        Returns a dict {success, count, sample}; success means at least
        one item was parsed.
        """
        test_config = {
            'url': url,
            'name': 'Test',
            'passkey': passkey,
            'use_flaresolverr': use_flaresolverr,
            'cookies': cookies
        }
        results = self.rss_source.fetch_feed(test_config, max_results=5)
        return {
            'success': len(results) > 0,
            'count': len(results),
            'sample': results[:3] if results else []
        }

    def fetch_latest(self, category, max_results=50):
        """Aggregate the latest items from every feed matching *category*.

        Per-feed failures are logged and skipped. Results are sorted
        newest-first by their sortable publish date.
        """
        all_results = []
        for feed in self.get_feeds_for_latest(category):
            try:
                results = self.rss_source.fetch_feed(feed, max_results=max_results)
                all_results.extend(results)
            except Exception as e:
                logger.error(f"❌ Erreur fetch RSS {feed.get('name')}: {e}")
        all_results.sort(key=lambda x: x.get('PublishDateRaw', ''), reverse=True)
        return all_results[:max_results]