Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified .DS_Store
Binary file not shown.
Binary file modified news_api/.DS_Store
Binary file not shown.
Binary file modified news_api/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file modified news_api/__pycache__/items.cpython-37.pyc
Binary file not shown.
Binary file modified news_api/__pycache__/items.cpython-38.pyc
Binary file not shown.
Binary file modified news_api/__pycache__/pipelines.cpython-37.pyc
Binary file not shown.
Binary file modified news_api/__pycache__/pipelines.cpython-38.pyc
Binary file not shown.
Binary file modified news_api/__pycache__/settings.cpython-37.pyc
Binary file not shown.
Binary file modified news_api/__pycache__/settings.cpython-38.pyc
Binary file not shown.
4 changes: 3 additions & 1 deletion news_api/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ class NewsApiItem(scrapy.Item):
description = scrapy.Field()
url = scrapy.Field()
publishedat = scrapy.Field()
content = scrapy.Field()
content = scrapy.Field()
sentiment = scrapy.Field()
keywords = scrapy.Field()

pass
11 changes: 5 additions & 6 deletions news_api/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,15 @@
# import ipdb
# import pdb
import logging
from datetime import datetime
from datetime import datetime, timedelta

class NewsApiPipeline(object):
    """Default no-op item pipeline.

    Passes every scraped item through unchanged; persistence is handled
    by MongoDBPipeline.
    """

    def process_item(self, item, spider):
        # Nothing to transform here — return the item as-is.
        return item

class MongoDBPipeline(object):

# pdb.set_trace()
today = 'newsAgg_' + datetime.strftime(datetime.now(), "%Y/%m/%d")
today = 'newsAgg_' + datetime.strftime((datetime.now() - timedelta(days=1)), "%Y%m%d")


def __init__(self, mongo_uri, mongo_db, mongo_collection):
Expand Down Expand Up @@ -52,16 +51,16 @@ def open_spider(self, spider):


    def close_spider(self, spider):
        ## clean up when spider is closed: release the MongoDB client's
        ## sockets/connection pool (client presumably created in
        ## __init__/open_spider — not fully visible here, confirm)
        self.client.close()

def process_item(self, item, spider):
## how to handle each post

# pdb.set_trace()

self.collection.insert(dict(item))

logging.debug("Post added to MongoDB")
return item

5 changes: 5 additions & 0 deletions news_api/settings.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[PD]
exist = true
api_key = D3ZcPSa5zmgWQl4SRgmQa1jhAV9Cgi1P2BUQAFXHDKI
GOOGLE_APPLICATION_CREDENTIALS = /home/user/Downloads/service-account-file.json

2 changes: 2 additions & 0 deletions news_api/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@

}



# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
Expand Down
Binary file modified news_api/spiders/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file modified news_api/spiders/__pycache__/news_agg.cpython-37.pyc
Binary file not shown.
Binary file modified news_api/spiders/__pycache__/news_agg.cpython-38.pyc
Binary file not shown.
75 changes: 60 additions & 15 deletions news_api/spiders/news_agg.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,35 +5,80 @@
from scrapy import Spider
from newsapi import NewsApiClient
from news_api.items import NewsApiItem
from datetime import datetime
import time
from datetime import datetime, timedelta

import paralleldots

# Setting your API key
paralleldots.set_api_key("D3ZcPSa5zmgWQl4SRgmQa1jhAV9Cgi1P2BUQAFXHDKI")

# Viewing your API key
# paralleldots.get_api_key()

#API Key: 2f48e626e6bb43afa1d50e6a9cce7728



class NewsApiSpider(scrapy.Spider):
    """Crawl NewsAPI's /v2/everything endpoint for yesterday's Tesla news.

    Keeps only articles whose description or content mentions one of the
    Tesla-related keywords, enriches each match with ParallelDots sentiment
    and keyword analysis, and yields one NewsApiItem per matching article.
    """

    name = "newsagg"
    headers = {'Connection': 'keep-alive','Cache-Control': 'max-age=0','DNT': '1','Upgrade-Insecure-Requests': '1','User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36','Sec-Fetch-User': '?1','Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3','Sec-Fetch-Site': 'same-origin','Sec-Fetch-Mode': 'navigate','Accept-Encoding': 'gzip, deflate, br','Accept-Language': 'en-US,en;q=0.9',}
    # NOTE(security): hard-coded API keys (here and in the URL below) should
    # move into Scrapy settings / environment variables, not source control.
    newsapi = NewsApiClient(api_key='2f48e626e6bb43afa1d50e6a9cce7728')
    # Query yesterday rather than today — presumably because NewsAPI's free
    # tier serves articles with a delay (TODO confirm).
    today = datetime.strftime(datetime.now() - timedelta(days=1), "%Y/%m/%d")
    # An article must mention one of these (in description or content) to be
    # kept. Fixed misspellings "ganization change" -> "organization change"
    # and "conavirus" -> "coronavirus", which could never match real text.
    keywordsToSearch = ["model s", "model 3", "model x", "model y", "solar",
                        "battery", "gigafactory", "autonomous driving",
                        "cybertruck", "elon musk", "organization change",
                        "stock price", "bear market", "china", "coronavirus",
                        "covid-19"]

    def start_requests(self):
        """Issue one request for yesterday's English-language Tesla articles."""
        urls = [
            'http://newsapi.org/v2/everything?q=tesla&from=' + self.today
            + '&to=' + self.today
            + '&sortBy=relevancy&language=en&apiKey=2f48e626e6bb43afa1d50e6a9cce7728'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse, headers=self.headers)

    def parse(self, response):
        """Parse the NewsAPI JSON payload and yield enriched NewsApiItems.

        Each article is yielded at most once, on its first keyword hit.
        The previous version looped over keywords and yielded a duplicate
        item for EVERY matching keyword, producing duplicate documents
        downstream in MongoDB.
        """
        body = json.loads(response.body)

        for value in body['articles']:
            content = value['content']
            description = value['description']

            # NewsAPI returns null for either field on some articles.
            if description is None or content is None:
                continue

            description_lc = description.lower()
            content_lc = content.lower()

            if not any(kw in description_lc or kw in content_lc
                       for kw in self.keywordsToSearch):
                print("no match found")
                continue

            # ParallelDots enrichment: sentiment on the description, keyword
            # extraction per sentence of the content. Local renamed from
            # `response`, which shadowed this method's scrapy parameter.
            sentiment = paralleldots.sentiment(description, "en")
            keywords = paralleldots.batch_keywords(content.split(". "))

            newsItem = NewsApiItem()
            newsItem['publishedat'] = value['publishedAt']
            newsItem['id'] = value['source']['id']
            newsItem['name'] = value['source']['name']
            newsItem['author'] = value['author']
            newsItem['description'] = description
            newsItem['url'] = value['url']
            newsItem['content'] = content
            newsItem['sentiment'] = sentiment
            newsItem['keywords'] = keywords

            yield newsItem


4 changes: 4 additions & 0 deletions news_api/spiders/settings.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[PD]
exist = true
api_key = D3ZcPSa5zmgWQl4SRgmQa1jhAV9Cgi1P2BUQAFXHDKI

4 changes: 4 additions & 0 deletions settings.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[PD]
exist = true
api_key = D3ZcPSa5zmgWQl4SRgmQa1jhAV9Cgi1P2BUQAFXHDKI