diff --git a/.gitignore b/.gitignore deleted file mode 100644 index 2b544a7..0000000 --- a/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -config.py -.seen -*.pyc diff --git a/Dockerfile b/Dockerfile deleted file mode 100644 index cfbc0f4..0000000 --- a/Dockerfile +++ /dev/null @@ -1,9 +0,0 @@ -FROM python:2-alpine - -RUN pip install feedparser html2text - -COPY feed2mail.py config.py / - -ENV SEEN_FILE=/seen/seen - -CMD python2 feed2mail.py diff --git a/LICENSE b/LICENSE deleted file mode 100644 index 8ddf75d..0000000 --- a/LICENSE +++ /dev/null @@ -1,14 +0,0 @@ -https://github.com/jonashaag/feed2mail -Copyright (c) 2010-2013 Jonas Haag and contributors (see Git logs). - -Permission to use, copy, modify, and/or distribute this software for any -purpose with or without fee is hereby granted, provided that the above -copyright notice and this permission notice appear in all copies. - -THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR -ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN -ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF -OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. diff --git a/README.rst b/README.rst index f425bb2..97f5c57 100644 --- a/README.rst +++ b/README.rst @@ -1,38 +1,34 @@ -feed2mail +rss2email --------- rss2email done simple. Delivers news from feeds (RSS, Atom, ...) to your mail box. -How to install -~~~~~~~~~~~~~~ - -Simply check out the Git repository or download the Python file. - -**Docker**:: - - docker build -t feed2mail . +Digested HTML format -**Alternatively, manual virtualenv**:: +**Required Env** +~~~~~~~~~~~~~~ pip install html2text feedparser How to use it? ~~~~~~~~~~~~~~ -1. ``cp example_config.py config.py``. +1. 
``cp -r rss2email.py config.py /data``. 2. Edit ``config.py``. -3. Run feed2mail every *N* seconds/hours/decades. For Docker setup:: - - docker run -v /path/to/your/seen/file:/seen feed2mail - - For manual virtualenv setup, simply run ``feed2mail.py``. +3. Run it with Python: ``python rss2email.py`` +4. Task: -I've found a bug! -~~~~~~~~~~~~~~~~~ -Great! `Please open a ticket`_. + ``vim crontab`` -.. _Please open a ticket: http://github.com/jonashaag/feed2mail/issues/ + ``minute hour day_of_month month day_of_week user_name cd ./location && python rss2email.py`` +RSS Link +~~~~~~~~~~~~~~ +Science: + https://www.science.org/action/showFeed?type=axatoc +Nature: + http://feeds.nature.com/nature/rss/current + License? ~~~~~~~~ ISC diff --git a/config.py b/config.py new file mode 100644 index 0000000..e213975 --- /dev/null +++ b/config.py @@ -0,0 +1,15 @@ +SENDER_MAIL = 'Sender Name ' +RECIPIENT_MAIL = 'recipient_email@email.com' +SMTP_SERVER = 'smtp.mail.com' +SMTP_USE_TLS = True +SMTP_PORT = 587 # can be absent/set to None for the default value +SMTP_USERNAME = 'smtp_username@email.com' +SMTP_PASSWORD = 'smtp_password' +# A list of feeds to fetch. +# Items must be `(feed_url, group_name)` tuples. 
+# Entries of feeds that make a group won't be sent twice if they appear +# on multiple feeds (often seen on News Sites that offer topic feeds) +FEEDS = [ + ('https://www.science.org/action/showFeed?type=axatoc','Science'), + ('http://feeds.nature.com/nature/rss/current','Nature'), +] \ No newline at end of file diff --git a/data/Nature b/data/Nature new file mode 100644 index 0000000..502e17c Binary files /dev/null and b/data/Nature differ diff --git a/data/Science b/data/Science new file mode 100644 index 0000000..dc01bf2 Binary files /dev/null and b/data/Science differ diff --git a/example_config.py b/example_config.py deleted file mode 100644 index 9d06f27..0000000 --- a/example_config.py +++ /dev/null @@ -1,16 +0,0 @@ -SENDER_MAIL = 'send@er.tld' -RECIPIENT_MAIL = 'rec@ipient.tld' -SMTP_SERVER = 'ser.ver.tld' -SMTP_USE_TLS = False -SMTP_PORT = None # can be absent/set to None for the default value - -# A list of feeds to fetch. -# Items may be `(feed_url, group_name)` tuples. -# Entries of feeds that make a group won't be sent twice of they appear -# on multiple feeds (often seen on News Sites that offer topic feeds) -FEEDS = [ - 'http://foobar.org/feed.rss', - 'http://blah.com/feed.atom', - ('http://www.reddit.com/r/python/.rss', 'reddit'), - ('http://www.reddit.com/r/programming/.rss', 'reddit') -] diff --git a/feed2mail.py b/feed2mail.py deleted file mode 100644 index 41848c0..0000000 --- a/feed2mail.py +++ /dev/null @@ -1,274 +0,0 @@ -#!/usr/bin/env python - -""" Copyright 2010-2017 Jonas Haag . ISC-licensed. """ -import os -import sys -import time -import smtplib -import email, email.utils, email.mime.text -import pickle - -import feedparser -import html2text - -import config - - -def warn(feed_url, status, msg): - print >> sys.stderr, \ - 'WARNING: %s HTTP %d: %s' % (feed_url, status, msg) - -def log(msg): - print msg - - -class BufferedUnicode(object): - """ - Simple pseudo unicode string. StringIO wasn't worth importing. 
- >>> buf = BufferedUnicode() - >>> buf += 'hello' - >>> buf += u' world!' - >>> buf.as_unicode() - 'hello world!' - """ - def __init__(self): - self._buf = [] - - def __iadd__(self, other): - try: - self._buf.append(unicode(other)) - return self - except UnicodeDecodeError: - raise TypeError('Expected unicode') - - def as_unicode(self): - return u''.join(self._buf) - - -def my_html2text(s): - return html2text.html2text(s.replace('\n', ' ')) - - -def force_plaintext(element): - if 'html' in element.type: - return my_html2text(element.value) - return element.value - - -def fetch_entries(feed_url, seen_entries): - log('Fetching %r...' % feed_url) - feed = feedparser.parse(feed_url) - - if feed.bozo: - status = feed.get('status', 404) - if status != 200: - warn(feed_url, status, feed.bozo_exception) - if 400 <= status < 600: - return - - for entry in feed.entries: - if 'id' not in entry: - assert entry.link - entry.id = entry.link - if entry.id in seen_entries: - log('Already saw entry %r' % entry.id) - continue - log('Got new entry %r' % entry.id) - seen_entries.add(entry.id) - try: entry['feed_author'] = feed.feed['author'] - except KeyError: pass - try: entry['feed_title'] = feed.feed['title_detail'] - except KeyError: pass - yield entry - - -def select_plaintext_body(entry): - """ - Returns the first plaintext body that can be found in `entry`, - or the first HTML body converted to plaintext using ``html2text`` - of none was found. - - Returns ``None`` if no bodies are found at all. - """ - bodies = entry.get('content', []) + [entry.get('summary_detail')] - bodies = filter(None, bodies) - if not bodies: - return None - for body in bodies: - if body.type == 'text/plain': - return body.value - return my_html2text(bodies[0].value) - - -def select_plaintext_title(entry): - """ - Returns the entry's title, converted to plaintext if needed, - or ``None`` if no title is found. 
- """ - try: - return force_plaintext(entry['title_detail']) - except KeyError: - pass - - -def select_timestamp(entry): - """ - Returns the date and time `entry` was updated, published or created - (respectively) as a time-tuple. - """ - for attr in ('updated', 'published', 'created'): - try: - return entry['%s_parsed'] % attr - except KeyError: - pass - return time.gmtime() - - -def generate_mail_for_entry(entry): - # the entry's title: - title = select_plaintext_title(entry) - # the entry's content: - body = select_plaintext_body(entry) - # the entry's permalink - link = entry.get('link', entry.id) - # the date+time the entry was updated/published: - timestamp = select_timestamp(entry) - # the entry's feed's title: - feed_title = force_plaintext(entry.get('feed_title')) - # the entry's author: - author = (entry.get('author') and entry.get('author').strip() or None) - # the feed's author: - feed_author = entry.get('feed_author') - # files attached to the entry: - enclosures = entry.get('enclosures', []) - - subject, author, body = format_mail( - entry.id, - link, - title, - timestamp, - author, - body, - feed_title, - feed_author, - enclosures, - ) - - mail = email.mime.text.MIMEText( - body.encode('utf-8'), - 'plain', - 'utf-8' - ) - mail['To'] = config.RECIPIENT_MAIL - mail['Subject'] = author + ': ' + subject - mail['From'] = config.SENDER_MAIL - mail['Date'] = email.utils.formatdate(time.mktime(timestamp)) - mail['X-RSS-Entry-ID'] = entry.id - - return entry.id, mail.as_string().replace('\n', '\r\n') - - -def format_mail(id, link, title, timestamp, author, body, - feed_title, feed_author, enclosures): - """ - Returns a `(subject, author, body)` tuple, forming the mail's - Subject and From headers and the mail's body, respectively. - - All arguments passed expect for `id` and `timestamp` can be ``None``. - - The returned tuple's items *must* be strings (they can be empty, though). - """ - if not title: - if body: - title = body[:70] + '...' 
- else: - title = link - - if feed_title: - author = feed_title - else: - if not author: - author = feed_author or '' - - content = BufferedUnicode() - content += title + '\n' + (link or id) - if enclosures: - content += ' [%d enclosures]' % len(enclosures) - - if body: - content += '\n\n' - content += body - content += '\n' - - if enclosures: - content += '-' * 20 - for enclosure in enclosures: - try: - length = int(float(enclosure.length)) - except (ValueError, AttributeError): - length = -1 - content += '\nEnclosure: %s (%s, %d bytes)' \ - % (enclosure.href, enclosure.type, length) - - return title.strip(), author.strip(), content.as_unicode() - - -format_mail = getattr(config, 'format_mail', format_mail) - - -def main(): - SEEN_FILE = os.environ.get('SEEN_FILE', '.seen') - - if os.path.exists(SEEN_FILE): - with open(SEEN_FILE, 'r') as fobj: - seen = pickle.load(fobj) - else: - seen = {} - - mail_queue = [] - - for feed in config.FEEDS: - if isinstance(feed, (list, tuple)): - feed, feed_id = feed - else: - feed_id = feed - seen.setdefault(feed_id, set()) - for entry in fetch_entries(feed, seen[feed_id]): - mail_queue.append(generate_mail_for_entry(entry)) - - mails = len(mail_queue) - sent = error = 0 - if mails: - smtp_server = smtplib.SMTP( - config.SMTP_SERVER, - getattr(config, 'SMTP_PORT', None) - ) - if getattr(config, 'SMTP_USE_TLS', False): - smtp_server.starttls() - if hasattr(config, 'SMTP_USERNAME') or hasattr(config, 'SMTP_PASSWORD'): - smtp_server.login( - getattr(config, 'SMTP_USERNAME', None), - getattr(config, 'SMTP_PASSWORD', None) - ) - for entry_id, mail in mail_queue: - log('Sending mail for entry %r...' 
% entry_id) - try: - smtp_server.sendmail( - config.SENDER_MAIL, - config.RECIPIENT_MAIL, - mail, - ) - sent += 1 - except: - import traceback - traceback.print_exc() - error += 1 - - log('-' * 20) - log('Sent %d of %d mails (%d errors)' % (sent, mails, error)) - - with open(SEEN_FILE, 'w') as fobj: - pickle.dump(seen, fobj) - -if __name__ == '__main__': - main() diff --git a/rss2email.py b/rss2email.py new file mode 100644 index 0000000..37ad985 --- /dev/null +++ b/rss2email.py @@ -0,0 +1,127 @@ +import feedparser +import email, email.utils, email.mime.text +import smtplib +import html2text +import pickle +import config + +class BufferedUnicode(object): + """ + Simple pseudo unicode string. StringIO wasn't worth importing. + >>> buf = BufferedUnicode() + >>> buf += 'hello' + >>> buf += ' world!' + >>> buf.as_unicode() + 'hello world!' + """ + def __init__(self): + self._buf = [] + + def __iadd__(self, other): + try: + self._buf.append(str(other)) + return self + except UnicodeDecodeError: + raise TypeError('Expected unicode') + + def as_unicode(self): + return ''.join(self._buf) + +def news_content_form(news): + content = BufferedUnicode() + # news link + try: + content += ('' % news.link) + except: + content += ('' % news.id) + # news title + content += news.title + '
' + # news author + try: + content += ('Author: %s...
' % news.author[:40].replace('\n',' ')) + except: + pass + # news time + for time_news in ('updated', 'published', 'created'): + try: + content += ('%s time: %s
' % (time_news,news[time_news])) + break + except KeyError: + pass + # news summary + try: + content += ('%s
' % html2text.html2text(news.summary)) + except: + pass + # content break + content += '

' + return content.as_unicode() + +def mail_content_form(entry): + body = BufferedUnicode() + for news in entry: + body += news_content_form(news) + return body.as_unicode() + +def mail_form(entry,mail_title): + mail_body = mail_content_form(entry) + mail = email.mime.text.MIMEText( + mail_body.encode('utf-8'), + 'html', + 'utf-8' + ) + mail['To'] = config.RECIPIENT_MAIL + mail['Subject'] = mail_title + mail['From'] = config.SENDER_MAIL + mail['Date'] = email.utils.formatdate(localtime=True) + return mail.as_string() + +def data_compare(feed_content,mail_title): + try: + with open('./data/%s' % mail_title,'rb') as file: + pre_data = pickle.load(file) + except: + with open('./data/%s' % mail_title,'wb') as file: + pickle.dump(feed_content,file) + return feed_content + if sorted(pre_data[0]) == sorted(feed_content[0]): + print('[Warning] No update. Would not send new email.') + return None + else: + with open('./data/%s' % mail_title,'wb') as file: + pickle.dump(feed_content,file) + return feed_content + +def main(): + smtp_server = smtplib.SMTP( + config.SMTP_SERVER, + getattr(config, 'SMTP_PORT', None) + ) + if getattr(config, 'SMTP_USE_TLS', False): + smtp_server.starttls() + if hasattr(config, 'SMTP_USERNAME') or hasattr(config, 'SMTP_PASSWORD'): + smtp_server.login( + getattr(config, 'SMTP_USERNAME', None), + getattr(config, 'SMTP_PASSWORD', None) + ) + for feed in config.FEEDS: + feed_url, mail_title = feed + print('[Info] Fetching %s: %s' % (mail_title,feed_url)) + feed_content = feedparser.parse(feed_url).entries + feed_content = data_compare(feed_content,mail_title) + if feed_content: + mail_content = mail_form(feed_content,mail_title) + try: + print('[Info] Email to %s from %s' % (config.RECIPIENT_MAIL,config.SENDER_MAIL)) + smtp_server.sendmail( + config.SENDER_MAIL, + config.RECIPIENT_MAIL, + mail_content, + ) + except: + import traceback + traceback.print_exc() + print('[Info] Finish') + +if __name__ == '__main__': + main() \ No newline at end of file