-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathserver.py
More file actions
130 lines (108 loc) · 4.43 KB
/
server.py
File metadata and controls
130 lines (108 loc) · 4.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 17 17:20:34 2016
@author: Rohan Kulkarni
@email : rohan.kulkarni@columbia.edu
"""
from __future__ import print_function
import sys
from flask import Flask,render_template
from mechanize import Browser
from goose import Goose
from multiprocessing import Pool,cpu_count
import math
import praw
from espncricinfo.summary import Summary
from bs4 import BeautifulSoup
from urllib import urlopen
app = Flask(__name__,static_url_path='/static')
class HackerNews():
    """Wraps a mechanize Browser parked on the Hacker News front page and
    a Goose extractor used to pull article text for each story link."""

    def __init__(self, browser, goose):
        self.browser_obj = browser
        # Ignore robots.txt and present a desktop Firefox user agent so
        # the site serves the regular front page.
        self.browser_obj.set_handle_robots(False)
        self.browser_obj.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
        self.browser_obj.open('https://news.ycombinator.com/')
        self.text_map = list()
        self.goose = goose

    def set_filters(self, filter_words):
        """Remember a collection of filter words for later use."""
        self.filters = filter_words

    def get_links(self):
        """Collect every link on the current page whose URL starts with http."""
        self.news_links = list(self.browser_obj.links(url_regex="^http{1}"))

    def strip_inlinks(self):
        """Drop the first link and the last two (site chrome, not stories)."""
        self.news_links = self.news_links[1:-2]

    def print_textmap(self):
        """Write every collected entry in text_map to stderr."""
        for entry in self.text_map:
            print(entry, file=sys.stderr)
# Module-level singletons shared by the route handlers below; the
# multiprocessing Pool workers each inherit their own copy of these.
browser = Browser()
goose = Goose()
hn = HackerNews(browser,goose)
def extract_link(link):
    """Follow *link* with the shared browser and extract the article.

    Returns a tuple (url, title, first-500-chars-of-cleaned-text), or
    None when the link cannot be fetched or parsed.  Runs inside a
    multiprocessing Pool worker, which has its own copy of the
    module-level browser/goose objects.
    """
    global hn
    global browser
    global goose
    try:
        browser.follow_link(link)
        url = browser.geturl()
        article = goose.extract(url=url)
        print(url, file=sys.stderr)
        browser.back()
        return (url, article.title, article.cleaned_text[:500])
    except Exception as exc:
        # Broken links / extraction failures are expected for some
        # stories: log and skip rather than kill the worker.  (Was a
        # bare ``except:``, which silently swallowed everything,
        # including KeyboardInterrupt and SystemExit.)
        print('extract_link failed for %r: %s' % (link, exc), file=sys.stderr)
        return None
def extract_reddit_link(x):
    """Build a {'title', 'url', 'text'} dict for one reddit submission,
    truncating the Goose-extracted article body to 500 characters."""
    body = goose.extract(url=x.url).cleaned_text[:500]
    return {'title': x.title,
            'url': x.url,
            'text': body + '...'}
class AppStatus():
    """Tracks which results page the app is currently showing."""

    def __init__(self, page_number):
        # Page the user is currently viewing.
        self.current_page = page_number
@app.route('/')
def loadInitialResults():
    """Front page: scrape HN, extract the top 25 stories in parallel,
    and render them via homepage.html (10 stories per page)."""
    hn.get_links()
    hn.strip_inlinks()
    numProc = cpu_count()*2
    pool = Pool(processes=numProc)
    try:
        initial_res = pool.map(extract_link,hn.news_links[:25])
    finally:
        # The original leaked a fresh pool of worker processes on every
        # request; shut the workers down explicitly.
        pool.close()
        pool.join()
    # extract_link returns None for stories it could not fetch.
    result = [x for x in initial_res if x is not None]
    news = [{'title':x[1],'url':x[0],'text':x[2]+'...'} for x in result]
    obj = dict()
    obj['link_data'] = news
    # Pagination links: pages 2..N at 10 stories per page (page 1 is
    # what this handler renders directly).
    obj['num_pages'] = range(2,int(math.ceil((float(len(result))/10.0)+1)))
    return render_template('homepage.html',returnObj=obj)
@app.route('/reddit_page/')
def loadRedditResults():
    """Reddit page: pull the 15 hottest /r/worldnews posts and extract
    their article text in parallel, then render homepage.html."""
    reddit = praw.Reddit(user_agent='rohan_news_client')
    submissions = reddit.get_subreddit('worldnews').get_hot(limit=15)
    numProc = cpu_count()*2
    pool = Pool(processes=numProc)
    try:
        news = pool.map(extract_reddit_link,submissions)
    finally:
        # Shut the worker processes down instead of leaking a pool per
        # request (the original never closed it).
        pool.close()
        pool.join()
    obj = dict()
    obj['link_data'] = news
    # Pagination links: pages 2..N at 10 items per page.
    obj['num_pages'] = range(2,int(math.ceil((float(len(news))/10.0)+1)))
    return render_template('homepage.html',returnObj=obj)
@app.route('/live_cricket/')
def loadCricketResults():
    """Cricket page: render the espncricinfo summary of every match."""
    summary = Summary()
    obj = dict()
    obj['match_data'] = list(summary.all_matches)
    return render_template('cricketpage.html',returnObj = obj)
@app.route('/live_football/')
def loadFootballResults():
    """Football page: scrape livescores.com and render each score row."""
    # urlopen returns a file-like response; close it instead of leaking
    # the socket (the original never closed it).
    response = urlopen("http://www.livescores.com")
    try:
        football_page = response.read()
    finally:
        response.close()
    football_soup = BeautifulSoup(football_page,'lxml')
    obj = dict()
    # Each 'row-gray' div holds one match line (teams + score).
    match_list = [row.text.strip()
                  for row in football_soup.findAll('div',class_="row-gray")]
    obj['match_data'] = match_list
    return render_template('footballpage.html',returnObj = obj)
if __name__ == '__main__':
    # Development entry point: Flask's built-in server on localhost:8078.
    # NOTE(review): debug=True enables the interactive Werkzeug debugger
    # (arbitrary code execution) -- never use in production.
    app.debug = True
    app.run(host='localhost',port=8078)