From 56da34a5e264f3fe9b198f43a976cca43052c967 Mon Sep 17 00:00:00 2001 From: yma-het Date: Sat, 11 Jul 2015 23:15:45 +0300 Subject: [PATCH 1/3] added decoding layer for HTML-encoded unicode that comes from feedparser --- util.py | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 64 insertions(+), 1 deletion(-) diff --git a/util.py b/util.py index 61ff2a5..9cb12e1 100644 --- a/util.py +++ b/util.py @@ -8,6 +8,7 @@ import urlparse import threading import feedparser +from HTMLParser import HTMLParser from htmlentitydefs import name2codepoint from settings import settings @@ -75,13 +76,75 @@ def abspath(path): path = os.path.abspath(path) path = 'file:///%s' % path.replace('\\', '/') return path + +def unescHTMLSpcChr(str): + return HTMLParser().unescape(str) + +def unescapeRSSObject(fpDict): + ''' + This function tries to find all human readable + strings in dict, that has been returned by feedparser + and if string is in dict, replaces it with HTML + escaped symbols. 
+ ''' + + if 'author' in fpDict: + fpDict['author'] = unescHTMLSpcChr(fpDict['author']) + + if 'author_detail' in fpDict: + if 'name' in fpDict['author_detail']: + fpDict['author_detail']['name'] = unescHTMLSpcChr(fpDict['author_detail']['name']) + + if 'comments' in fpDict: + fpDict['comments'] = unescHTMLSpcChr(fpDict['comments']) + + if 'content' in fpDict: + fpDict['content'] = unescHTMLSpcChr(fpDict['content']) + + if 'contributors' in fpDict: + fpDict['contributors'] = unescHTMLSpcChr(fpDict['contributors']) + + if 'summary' in fpDict: + fpDict['summary'] = unescHTMLSpcChr(fpDict['summary']) + + if 'summary_detail' in fpDict: + if 'value' in fpDict['summary_detail']: + fpDict['summary_detail']['value'] = unescHTMLSpcChr(fpDict['summary_detail']['value']) + + if 'summary_detail' in fpDict: + if 'value' in fpDict['summary_detail']: + fpDict['summary_detail']['value'] = unescHTMLSpcChr(fpDict['summary_detail']['value']) + + if 'tags' in fpDict: + for index, tag in enumerate(fpDict['tags']): + if 'term' in tag: + fpDict['tags'][index]['term'] = unescHTMLSpcChr(fpDict['tags'][index]['term']) + if 'label' in tag: + fpDict['tags'][index]['label'] = unescHTMLSpcChr(fpDict['tags'][index]['label']) + + if 'title' in fpDict: + fpDict['title'] = unescHTMLSpcChr(fpDict['title']) + + if 'title_detail' in fpDict: + if 'value' in fpDict['title_detail']: + fpDict['title_detail']['value'] = unescHTMLSpcChr(fpDict['title_detail']['value']) + + return fpDict + +def decodeRSS(rss): + for index, record in enumerate(rss): + rss[index] = unescapeRSSObject(record) + return rss def parse(url, username=None, password=None, etag=None, modified=None): agent = settings.USER_AGENT handlers = [get_proxy()] if username and password: url = insert_credentials(url, username, password) - return feedparser.parse(url, etag=etag, modified=modified, agent=agent, handlers=handlers) + response = feedparser.parse(url, etag=etag, modified=modified, agent=agent, handlers=handlers) + if "entries" in 
response: + response["entries"] = decodeRSS(response["entries"]) + return response def is_valid_feed(data): entries = get(data, 'entries', []) From 94da1276ee4cb8d60fb07613c2d315f5c9ac7e42 Mon Sep 17 00:00:00 2001 From: yma-het Date: Sun, 12 Jul 2015 22:24:39 +0300 Subject: [PATCH 2/3] small code refactoring and fixed incorrect behaviour with none type objects --- util.py | 81 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 41 insertions(+), 40 deletions(-) diff --git a/util.py b/util.py index 9cb12e1..6621179 100644 --- a/util.py +++ b/util.py @@ -9,6 +9,7 @@ import threading import feedparser from HTMLParser import HTMLParser +import operator from htmlentitydefs import name2codepoint from settings import settings @@ -80,6 +81,22 @@ def abspath(path): def unescHTMLSpcChr(str): return HTMLParser().unescape(str) +def keyExistsAndNotNull(fpElement, *keys): + if len(keys) == 1: + if keys[0] in fpElement: + if fpElement[keys[0]]: + return True + return False + if len(keys) == 2: + if keys[0] in fpElement: + if fpElement[keys[0]]: + if keys[1] in fpElement[keys[0]]: + if fpElement[keys[0]][keys[1]]: + return True + return False + else: + raise Exception("Unknown signature of doIfExists() call!") + def unescapeRSSObject(fpDict): ''' This function tries to find all human readable @@ -87,48 +104,32 @@ def unescapeRSSObject(fpDict): and if string is in dict, replaces it with HTML escaped symbols. 
''' + unescapeToVar = lambda item, key: operator.setitem(item, key, unescHTMLSpcChr(item[key])) - if 'author' in fpDict: - fpDict['author'] = unescHTMLSpcChr(fpDict['author']) - - if 'author_detail' in fpDict: - if 'name' in fpDict['author_detail']: - fpDict['author_detail']['name'] = unescHTMLSpcChr(fpDict['author_detail']['name']) - - if 'comments' in fpDict: - fpDict['comments'] = unescHTMLSpcChr(fpDict['comments']) - - if 'content' in fpDict: - fpDict['content'] = unescHTMLSpcChr(fpDict['content']) - - if 'contributors' in fpDict: - fpDict['contributors'] = unescHTMLSpcChr(fpDict['contributors']) - - if 'summary' in fpDict: - fpDict['summary'] = unescHTMLSpcChr(fpDict['summary']) - - if 'summary_detail' in fpDict: - if 'value' in fpDict['summary_detail']: - fpDict['summary_detail']['value'] = unescHTMLSpcChr(fpDict['summary_detail']['value']) - - if 'summary_detail' in fpDict: - if 'value' in fpDict['summary_detail']: - fpDict['summary_detail']['value'] = unescHTMLSpcChr(fpDict['summary_detail']['value']) - - if 'tags' in fpDict: + if keyExistsAndNotNull(fpDict, 'author'): + unescapeToVar(fpDict, 'author') + if keyExistsAndNotNull(fpDict, 'author_detail', 'name'): + unescapeToVar(fpDict['author_detail'], 'name') + if keyExistsAndNotNull(fpDict, 'comments'): + unescapeToVar(fpDict, 'comments') + if keyExistsAndNotNull(fpDict, 'content'): + unescapeToVar(fpDict, 'content') + if keyExistsAndNotNull(fpDict, 'contributors'): + unescapeToVar(fpDict, 'contributors') + if keyExistsAndNotNull(fpDict, 'summary'): + unescapeToVar(fpDict, 'summary') + if keyExistsAndNotNull(fpDict, 'summary_detail', 'value'): + unescapeToVar(fpDict['summary_detail'], 'value') + if keyExistsAndNotNull(fpDict, 'tags'): for index, tag in enumerate(fpDict['tags']): - if 'term' in tag: - fpDict['tags'][index]['term'] = unescHTMLSpcChr(fpDict['tags'][index]['term']) - if 'label' in tag: - fpDict['tags'][index]['label'] = unescHTMLSpcChr(fpDict['tags'][index]['label']) - - if 'title' in fpDict: - 
fpDict['title'] = unescHTMLSpcChr(fpDict['title']) - - if 'title_detail' in fpDict: - if 'value' in fpDict['title_detail']: - fpDict['title_detail']['value'] = unescHTMLSpcChr(fpDict['title_detail']['value']) - + if keyExistsAndNotNull(tag, 'term'): + unescapeToVar(fpDict['tags'][index], 'term') + if keyExistsAndNotNull(tag, 'label'): + unescapeToVar(fpDict['tags'][index], 'label') + if keyExistsAndNotNull(fpDict, 'title'): + unescapeToVar(fpDict, 'title') + if keyExistsAndNotNull(fpDict, 'title_detail', 'value'): + unescapeToVar(fpDict['title_detail'], 'value') return fpDict def decodeRSS(rss): From afdde3e997187e25b7b7d55b2db2c6ed66bd2e7f Mon Sep 17 00:00:00 2001 From: yma-het Date: Fri, 17 Jul 2015 01:07:48 +0300 Subject: [PATCH 3/3] added support of filtering by category --- feeds.py | 2 ++ filters.py | 17 ++++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/feeds.py b/feeds.py index ef87bb2..2b340e8 100644 --- a/feeds.py +++ b/feeds.py @@ -29,6 +29,7 @@ def __init__(self, feed, id): self.description = '' self.link = '' self.author = '' + self.categories = [] self.read = False @property def time_since(self): @@ -136,6 +137,7 @@ def poll(self, timestamp, filters): item.description = util.format(util.get(entry, 'description', ''), settings.POPUP_BODY_LENGTH) item.link = util.get(entry, 'link', '') item.author = util.format(util.get(entry, 'author', '')) # TODO: max length + item.categories = util.get(entry, 'tags', []) if all(filter.filter(item) for filter in filters): result.append(item) self.clean_cache(settings.FEED_CACHE_SIZE) diff --git a/filters.py b/filters.py index 40da2d4..a13abdc 100644 --- a/filters.py +++ b/filters.py @@ -8,6 +8,7 @@ LINK = 2 AUTHOR = 4 CONTENT = 8 +CATEGORY = 16 TYPES = { None: INCLUDE, @@ -21,6 +22,7 @@ 'link:': LINK, 'author:': AUTHOR, 'content:': CONTENT, + 'category:': CATEGORY, } TYPE_STR = { @@ -34,6 +36,7 @@ LINK: 'link', AUTHOR: 'author', CONTENT: 'content', + CATEGORY: 'category', } class 
Rule(object): @@ -51,6 +54,12 @@ def evaluate(self, item, ignore_case=True, whole_word=True): strings.append(item.author) if self.qualifier & CONTENT: strings.append(item.description) + if self.qualifier & CATEGORY: + if item.categories: + for category_item in item.categories: + if category_item: + if 'term' in category_item: + strings.append(category_item['term']) text = '\n'.join(strings) word = self.word if ignore_case: @@ -113,6 +122,7 @@ def __str__(self): 'LINK', 'AUTHOR', 'CONTENT', + 'CATEGORY', 'WORD', ] + reserved.values() @@ -136,7 +146,11 @@ def t_AUTHOR(t): def t_CONTENT(t): r'content:' return t - + +def t_CATEGORY(t): + r'category:' + return t + def t_WORD(t): r'(\'[^\']+\') | (\"[^\"]+\") | ([^ \n\t\r+\-()\'"]+)' t.type = reserved.get(t.value, 'WORD') @@ -197,6 +211,7 @@ def p_qualifier(t): | LINK | AUTHOR | CONTENT + | CATEGORY | empty''' t[0] = t[1]