From 56c8287c00eab4c3b6673502d5b964a43d70ccb7 Mon Sep 17 00:00:00 2001 From: Vincent Le Gallic Date: Fri, 27 Sep 2013 16:36:25 +0200 Subject: [PATCH] =?utf8?q?Mise=20=C3=A0=20jour=20du=20flux=20RSS=20de=20dt?= =?utf8?q?c?= MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit --- dtc.py | 33 ++++----------------------------- fetch_dtc.py | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 29 deletions(-) create mode 100755 fetch_dtc.py diff --git a/dtc.py b/dtc.py index 826b2c0..d9cd465 100755 --- a/dtc.py +++ b/dtc.py @@ -2,13 +2,12 @@ # -*- coding:utf-8 -*- import sys -import html2text -import nltk -import re -import feedparser import psycopg2 import psycopg2.extras import json + +import fetch_dtc + VERBOSE = False def getcursor(): @@ -33,29 +32,6 @@ def last_inserted(): cur.execute("SELECT MAX(id) AS maxid FROM quotes;") return cur.fetchone()["maxid"] -def fetch_rss(): - """Récupère le flux RSS et le formate""" - flux = feedparser.parse('http://feeds.feedburner.com/bashfr-quotes') - quotes = [] - for q in flux["entries"]: - try: - id = int(q["title"]) - except ValueError: - print "FAILED : " + q["title"] - continue - date = q["published"] - quote = format(q["summary_detail"]["value"]) - quotes.append({"id" : id, "date" : date, "quote" : quote}) - return quotes - - -def format(quote): - """Dé-HTML-ise la quote""" - raw = html2text.unescape(nltk.clean_html(quote)) - # Fucking garbage - raw = re.sub(" Votez !$", "", raw) - return raw - def get_quotes(first, last=None): """Récupère des quotes dans la base.""" add = "" @@ -79,8 +55,7 @@ def display(liste): if __name__ == "__main__": import sys if sys.argv[1] in ["fetch", "update", "pull"]: - l = fetch_rss() - print l + l = fetch_dtc.getquotes() last = last_inserted() for q in l: if q["id"] > last: diff --git a/fetch_dtc.py b/fetch_dtc.py new file mode 100755 index 0000000..de84884 --- /dev/null +++ b/fetch_dtc.py @@ -0,0 +1,52 @@ 
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+"""Fetch DTC (danstonchat.com) quotes from the site's RSS feed."""
+
+import BeautifulSoup
+import feedparser
+import re
+
+import html2text
+
+#: Regexp matching the footer found under each converted quote. It captures
+#: the quote id and is also used to strip that footer from the quote text.
+#: NOTE(review): the "." in "les.commentaires" also matches a newline because
+#: of re.DOTALL -- presumably to survive html2text line wrapping; confirm
+#: before tightening the pattern.
+endquote_regexp = ur"\n\[#(\d+)\]\(http://danstonchat.com/\1\) - \[Voir les.commentaires\]\(http://danstonchat.com/\1#c\)"
+endreg = re.compile(endquote_regexp, flags=re.DOTALL)
+
+
+def properquote(quote):
+    """Return a quote taken from the RSS feed in a readable text format.
+
+    ``quote`` is stringified, decoded as UTF-8 and converted from HTML to
+    plain text with ``html2text``.
+    """
+    quote = str(quote).decode("utf-8")
+    quote = html2text.html2text(quote)
+    # html2text needlessly generates empty lines: collapse doubled newlines.
+    quote = quote.replace(u"\n\n", u"\n")
+    return quote
+
+
+def parsequote(quote):
+    """Parse a converted quote.
+
+    Returns ``{"id": <int>, "quote": <text without the footer>}``.
+
+    Returns ``None`` when the text carries no id footer (that is, when it is
+    not actually a quote) and also when it is an ``"_(A propos de)_"``
+    comment.
+    """
+    # Cheap prefix test first, so comments never pay for a regexp scan.
+    if quote.startswith(u"_(A propos de)_"):
+        return None
+    match = endreg.search(quote)
+    if match is None:
+        return None
+    return {"id": int(match.group(1)), "quote": endreg.sub("", quote)}
+
+
+def getquotes():
+    """Fetch all the quotes of the feed, with publication date and id.
+
+    Returns a list of dicts with the keys ``"id"``, ``"quote"`` and
+    ``"date"`` (the ``"published"`` value of the feed entry the quote
+    came from).
+    """
+    flux = feedparser.parse("http://danstonchat.com/rss.xml")
+    allquotes = []
+    for entry in flux["entries"]:
+        soup = BeautifulSoup.BeautifulSoup(entry["summary"])
+        # Beware: the summary does not only contain quotes, but also top
+        # comments and other unrelated paragraphs. parsequote() returns
+        # None for those, so they are dropped here.
+        for paragraph in soup.findChildren(name="p"):
+            parsed = parsequote(properquote(paragraph))
+            if parsed is not None:
+                parsed["date"] = entry["published"]
+                allquotes.append(parsed)
+    return allquotes
-- 
2.39.2