diff --git a/.DS_Store b/.DS_Store index 3c65761..607b25d 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/news_api/.DS_Store b/news_api/.DS_Store index 93f448e..86f798c 100644 Binary files a/news_api/.DS_Store and b/news_api/.DS_Store differ diff --git a/news_api/__pycache__/__init__.cpython-37.pyc b/news_api/__pycache__/__init__.cpython-37.pyc index 1456772..7e4c28a 100644 Binary files a/news_api/__pycache__/__init__.cpython-37.pyc and b/news_api/__pycache__/__init__.cpython-37.pyc differ diff --git a/news_api/__pycache__/items.cpython-37.pyc b/news_api/__pycache__/items.cpython-37.pyc index cd1b14a..ec378ea 100644 Binary files a/news_api/__pycache__/items.cpython-37.pyc and b/news_api/__pycache__/items.cpython-37.pyc differ diff --git a/news_api/__pycache__/items.cpython-38.pyc b/news_api/__pycache__/items.cpython-38.pyc index 1f2c692..d19e7a5 100644 Binary files a/news_api/__pycache__/items.cpython-38.pyc and b/news_api/__pycache__/items.cpython-38.pyc differ diff --git a/news_api/__pycache__/pipelines.cpython-37.pyc b/news_api/__pycache__/pipelines.cpython-37.pyc index 4a2a1db..8334204 100644 Binary files a/news_api/__pycache__/pipelines.cpython-37.pyc and b/news_api/__pycache__/pipelines.cpython-37.pyc differ diff --git a/news_api/__pycache__/pipelines.cpython-38.pyc b/news_api/__pycache__/pipelines.cpython-38.pyc index 5ebffe3..7a2ab95 100644 Binary files a/news_api/__pycache__/pipelines.cpython-38.pyc and b/news_api/__pycache__/pipelines.cpython-38.pyc differ diff --git a/news_api/__pycache__/settings.cpython-37.pyc b/news_api/__pycache__/settings.cpython-37.pyc index b50ae65..0b27a6d 100644 Binary files a/news_api/__pycache__/settings.cpython-37.pyc and b/news_api/__pycache__/settings.cpython-37.pyc differ diff --git a/news_api/__pycache__/settings.cpython-38.pyc b/news_api/__pycache__/settings.cpython-38.pyc index 397b5d2..69ff824 100644 Binary files a/news_api/__pycache__/settings.cpython-38.pyc and b/news_api/__pycache__/settings.cpython-38.pyc differ diff --git a/news_api/items.py b/news_api/items.py index c5cb08a..6a6313d 100644 --- a/news_api/items.py +++ b/news_api/items.py @@ -18,6 +18,8 @@ class NewsApiItem(scrapy.Item): description = scrapy.Field() url = scrapy.Field() publishedat = scrapy.Field() - content = scrapy.Field() + content = scrapy.Field() + sentiment = scrapy.Field() + keywords = scrapy.Field() pass \ No newline at end of file diff --git a/news_api/pipelines.py b/news_api/pipelines.py index e64be7a..b224dba 100644 --- a/news_api/pipelines.py +++ b/news_api/pipelines.py @@ -12,16 +12,15 @@ # import ipdb # import pdb import logging -from datetime import datetime +from datetime import datetime, timedelta class NewsApiPipeline(object): def process_item(self, item, spider): return item class MongoDBPipeline(object): - # pdb.set_trace() - today = 'newsAgg_' + datetime.strftime(datetime.now(), "%Y/%m/%d") + today = 'newsAgg_' + datetime.strftime((datetime.now() - timedelta(days=1)), "%Y%m%d") def __init__(self, mongo_uri, mongo_db, mongo_collection): @@ -52,16 +51,16 @@ def open_spider(self, spider): def close_spider(self, spider): - # ## clean up when spider is closed + ## clean up when spider is closed self.client.close() def process_item(self, item, spider): ## how to handle each post - + # pdb.set_trace() self.collection.insert(dict(item)) - + logging.debug("Post added to MongoDB") return item diff --git a/news_api/settings.cfg b/news_api/settings.cfg new file mode 100644 index 0000000..1ee6354 --- /dev/null +++ b/news_api/settings.cfg @@ -0,0 +1,5 @@ +[PD] +exist = true +api_key = D3ZcPSa5zmgWQl4SRgmQa1jhAV9Cgi1P2BUQAFXHDKI +GOOGLE_APPLICATION_CREDENTIALS="/home/user/Downloads/service-account-file.json" + diff --git a/news_api/settings.py b/news_api/settings.py index e355d84..79eae25 100644 --- a/news_api/settings.py +++ b/news_api/settings.py @@ -70,6 +70,8 @@ } + + # Enable and configure the AutoThrottle extension (disabled by default) # See https://docs.scrapy.org/en/latest/topics/autothrottle.html #AUTOTHROTTLE_ENABLED = True diff --git a/news_api/spiders/__pycache__/__init__.cpython-37.pyc b/news_api/spiders/__pycache__/__init__.cpython-37.pyc index 1a91888..2d919ac 100644 Binary files a/news_api/spiders/__pycache__/__init__.cpython-37.pyc and b/news_api/spiders/__pycache__/__init__.cpython-37.pyc differ diff --git a/news_api/spiders/__pycache__/news_agg.cpython-37.pyc b/news_api/spiders/__pycache__/news_agg.cpython-37.pyc index 2353b3a..fb79302 100644 Binary files a/news_api/spiders/__pycache__/news_agg.cpython-37.pyc and b/news_api/spiders/__pycache__/news_agg.cpython-37.pyc differ diff --git a/news_api/spiders/__pycache__/news_agg.cpython-38.pyc b/news_api/spiders/__pycache__/news_agg.cpython-38.pyc index 7cefe63..d95c5ad 100644 Binary files a/news_api/spiders/__pycache__/news_agg.cpython-38.pyc and b/news_api/spiders/__pycache__/news_agg.cpython-38.pyc differ diff --git a/news_api/spiders/news_agg.py b/news_api/spiders/news_agg.py index bedb77f..ec3f503 100644 --- a/news_api/spiders/news_agg.py +++ b/news_api/spiders/news_agg.py @@ -5,35 +5,80 @@ from scrapy import Spider from newsapi import NewsApiClient from news_api.items import NewsApiItem -from datetime import datetime +import time +from datetime import datetime, timedelta + +import paralleldots + +# Setting your API key +paralleldots.set_api_key("D3ZcPSa5zmgWQl4SRgmQa1jhAV9Cgi1P2BUQAFXHDKI") + +# Viewing your API key +# paralleldots.get_api_key() #API Key: 2f48e626e6bb43afa1d50e6a9cce7728 + + class NewsApiSpider(scrapy.Spider): + name = "newsagg" headers = {'Connection': 'keep-alive','Cache-Control': 'max-age=0','DNT': '1','Upgrade-Insecure-Requests': '1','User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36','Sec-Fetch-User': '?1','Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3','Sec-Fetch-Site': 'same-origin','Sec-Fetch-Mode': 'navigate','Accept-Encoding': 'gzip, deflate, br','Accept-Language': 'en-US,en;q=0.9',} newsapi = NewsApiClient(api_key='2f48e626e6bb43afa1d50e6a9cce7728') - today = datetime.strftime(datetime.now(), "%Y/%m/%d") + today = datetime.strftime((datetime.now() - timedelta(days=1)), "%Y/%m/%d") + keywordsToSearch = ["model s","model 3","model x","model y", "solar", "battery", "gigafactory", "autonomous driving","cybertruck", "elon musk", "ganization change", "stock price","bear market","china","conavirus","covid-19"] + def start_requests(self): + # pdb.set_trace() urls = [ - 'http://newsapi.org/v2/everything?q=tesla&from=' + self.today + '&to=' + self.today + '&sortBy=popularity&apiKey=2f48e626e6bb43afa1d50e6a9cce7728' + 'http://newsapi.org/v2/everything?q=tesla&from=' + self.today + '&to=' + self.today + '&sortBy=relevancy&language=en&apiKey=2f48e626e6bb43afa1d50e6a9cce7728' ] for url in urls: yield scrapy.Request(url=url, callback=self.parse, headers=self.headers) - def parse(self, response): body = json.loads(response.body) - + for value in body['articles']: - newsItem = NewsApiItem() - newsItem['publishedat'] = value['publishedAt'] - newsItem['id'] = value['source']['id'] - newsItem['name'] = value['source']['name'] - newsItem['author'] = value['author'] - newsItem['description'] = value['description'] - newsItem['url'] = value['url'] - newsItem['content'] = value['content'] - - yield newsItem + + content = value['content'] + description = value['description'] + + # if description is None: + # pdb.set_trace() + # if content is None: + # pdb.set_trace() + + for keyword in self.keywordsToSearch: + # pdb.set_trace() + if ((description != None and content != None) and ((keyword in description.lower()) or (keyword in content.lower()))) : + # pdb.set_trace() + # if "model s" in content.lower() or "model 3" in content.lower() or "model x" in content.lower() or "model y" in content.lower() or "solar" in content.lower() or "battery" in content.lower() or "gigafactory" in content.lower() or "autonomous driving" in content.lower() or "cybertruck" in content.lower() or "elon musk" in content.lower() or "organization change" in content.lower() or "stock price" in content.lower() or "bear market" in content.lower() or "china" in content.lower() or "coronavirus" in content.lower() or "covid-19" in content.lower(): + lang_code="en" + response=paralleldots.sentiment(value['description'], lang_code) + splitText=value['content'].split(". ") + keywords=paralleldots.batch_keywords(splitText) + + # pdb.set_trace() + + newsItem = NewsApiItem() + + newsItem['publishedat'] = value['publishedAt'] + newsItem['id'] = value['source']['id'] + newsItem['name'] = value['source']['name'] + newsItem['author'] = value['author'] + newsItem['description'] = value['description'] + newsItem['url'] = value['url'] + newsItem['content'] = value['content'] + newsItem['sentiment'] = response + newsItem['keywords'] = keywords + + + yield newsItem + + else: + # pdb.set_trace() + print("no match found") + + diff --git a/news_api/spiders/settings.cfg b/news_api/spiders/settings.cfg new file mode 100644 index 0000000..99b25bc --- /dev/null +++ b/news_api/spiders/settings.cfg @@ -0,0 +1,4 @@ +[PD] +exist = true +api_key = D3ZcPSa5zmgWQl4SRgmQa1jhAV9Cgi1P2BUQAFXHDKI + diff --git a/settings.cfg b/settings.cfg new file mode 100644 index 0000000..99b25bc --- /dev/null +++ b/settings.cfg @@ -0,0 +1,4 @@ +[PD] +exist = true +api_key = D3ZcPSa5zmgWQl4SRgmQa1jhAV9Cgi1P2BUQAFXHDKI +