Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified .DS_Store
Binary file not shown.
Binary file modified news_api/.DS_Store
Binary file not shown.
Binary file modified news_api/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file modified news_api/__pycache__/items.cpython-37.pyc
Binary file not shown.
Binary file modified news_api/__pycache__/items.cpython-38.pyc
Binary file not shown.
Binary file modified news_api/__pycache__/pipelines.cpython-37.pyc
Binary file not shown.
Binary file modified news_api/__pycache__/pipelines.cpython-38.pyc
Binary file not shown.
Binary file modified news_api/__pycache__/settings.cpython-37.pyc
Binary file not shown.
Binary file modified news_api/__pycache__/settings.cpython-38.pyc
Binary file not shown.
4 changes: 3 additions & 1 deletion news_api/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ class NewsApiItem(scrapy.Item):
description = scrapy.Field()
url = scrapy.Field()
publishedat = scrapy.Field()
content = scrapy.Field()
content = scrapy.Field()
sentiment = scrapy.Field()
keywords = scrapy.Field()

pass
11 changes: 5 additions & 6 deletions news_api/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,16 +12,15 @@
# import ipdb
# import pdb
import logging
from datetime import datetime
from datetime import datetime, timedelta

class NewsApiPipeline(object):
    """Default no-op item pipeline.

    Passes every scraped item through unchanged; persistence is handled
    by MongoDBPipeline.
    """

    def process_item(self, item, spider):
        # Nothing to transform here — return the item as-is.
        return item

class MongoDBPipeline(object):

# pdb.set_trace()
today = 'newsAgg_' + datetime.strftime(datetime.now(), "%Y/%m/%d")
today = 'newsAgg_' + datetime.strftime((datetime.now() - timedelta(days=1)), "%Y%m%d")


def __init__(self, mongo_uri, mongo_db, mongo_collection):
Expand Down Expand Up @@ -52,16 +51,16 @@ def open_spider(self, spider):


    def close_spider(self, spider):
        ## clean up when spider is closed: release the MongoDB client's
        ## sockets/connection pool (client presumably created in
        ## __init__/open_spider — not fully visible here, confirm)
        self.client.close()

def process_item(self, item, spider):
## how to handle each post

# pdb.set_trace()

self.collection.insert(dict(item))

logging.debug("Post added to MongoDB")
return item

5 changes: 5 additions & 0 deletions news_api/settings.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
[PD]
exist = true
api_key = D3ZcPSa5zmgWQl4SRgmQa1jhAV9Cgi1P2BUQAFXHDKI
GOOGLE_APPLICATION_CREDENTIALS = /home/user/Downloads/service-account-file.json

2 changes: 2 additions & 0 deletions news_api/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@

}



# Enable and configure the AutoThrottle extension (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
Expand Down
Binary file modified news_api/spiders/__pycache__/__init__.cpython-37.pyc
Binary file not shown.
Binary file modified news_api/spiders/__pycache__/news_agg.cpython-37.pyc
Binary file not shown.
Binary file modified news_api/spiders/__pycache__/news_agg.cpython-38.pyc
Binary file not shown.
75 changes: 60 additions & 15 deletions news_api/spiders/news_agg.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,35 +5,80 @@
from scrapy import Spider
from newsapi import NewsApiClient
from news_api.items import NewsApiItem
from datetime import datetime
import time
from datetime import datetime, timedelta

import paralleldots

# Setting your API key
paralleldots.set_api_key("D3ZcPSa5zmgWQl4SRgmQa1jhAV9Cgi1P2BUQAFXHDKI")

# Viewing your API key
# paralleldots.get_api_key()

#API Key: 2f48e626e6bb43afa1d50e6a9cce7728



class NewsApiSpider(scrapy.Spider):
    """Crawl NewsAPI's /v2/everything endpoint for yesterday's Tesla news.

    Keeps only articles whose description or content mentions one of the
    Tesla-related keywords, enriches each match with ParallelDots sentiment
    and keyword analysis, and yields one NewsApiItem per matching article.
    """

    name = "newsagg"
    headers = {'Connection': 'keep-alive','Cache-Control': 'max-age=0','DNT': '1','Upgrade-Insecure-Requests': '1','User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36','Sec-Fetch-User': '?1','Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3','Sec-Fetch-Site': 'same-origin','Sec-Fetch-Mode': 'navigate','Accept-Encoding': 'gzip, deflate, br','Accept-Language': 'en-US,en;q=0.9',}
    # NOTE(security): hard-coded API keys (here and in the URL below) should
    # move into Scrapy settings / environment variables, not source control.
    newsapi = NewsApiClient(api_key='2f48e626e6bb43afa1d50e6a9cce7728')
    # Query yesterday rather than today — presumably because NewsAPI's free
    # tier serves articles with a delay (TODO confirm).
    today = datetime.strftime(datetime.now() - timedelta(days=1), "%Y/%m/%d")
    # An article must mention one of these (in description or content) to be
    # kept. Fixed misspellings "ganization change" -> "organization change"
    # and "conavirus" -> "coronavirus", which could never match real text.
    keywordsToSearch = ["model s", "model 3", "model x", "model y", "solar",
                        "battery", "gigafactory", "autonomous driving",
                        "cybertruck", "elon musk", "organization change",
                        "stock price", "bear market", "china", "coronavirus",
                        "covid-19"]

    def start_requests(self):
        """Issue one request for yesterday's English-language Tesla articles."""
        urls = [
            'http://newsapi.org/v2/everything?q=tesla&from=' + self.today
            + '&to=' + self.today
            + '&sortBy=relevancy&language=en&apiKey=2f48e626e6bb43afa1d50e6a9cce7728'
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse, headers=self.headers)

    def parse(self, response):
        """Parse the NewsAPI JSON payload and yield enriched NewsApiItems.

        Each article is yielded at most once, on its first keyword hit.
        The previous version looped over keywords and yielded a duplicate
        item for EVERY matching keyword, producing duplicate documents
        downstream in MongoDB.
        """
        body = json.loads(response.body)

        for value in body['articles']:
            content = value['content']
            description = value['description']

            # NewsAPI returns null for either field on some articles.
            if description is None or content is None:
                continue

            description_lc = description.lower()
            content_lc = content.lower()

            if not any(kw in description_lc or kw in content_lc
                       for kw in self.keywordsToSearch):
                print("no match found")
                continue

            # ParallelDots enrichment: sentiment on the description, keyword
            # extraction per sentence of the content. Local renamed from
            # `response`, which shadowed this method's scrapy parameter.
            sentiment = paralleldots.sentiment(description, "en")
            keywords = paralleldots.batch_keywords(content.split(". "))

            newsItem = NewsApiItem()
            newsItem['publishedat'] = value['publishedAt']
            newsItem['id'] = value['source']['id']
            newsItem['name'] = value['source']['name']
            newsItem['author'] = value['author']
            newsItem['description'] = description
            newsItem['url'] = value['url']
            newsItem['content'] = content
            newsItem['sentiment'] = sentiment
            newsItem['keywords'] = keywords

            yield newsItem


4 changes: 4 additions & 0 deletions news_api/spiders/settings.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[PD]
exist = true
api_key = D3ZcPSa5zmgWQl4SRgmQa1jhAV9Cgi1P2BUQAFXHDKI

4 changes: 4 additions & 0 deletions settings.cfg
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[PD]
exist = true
api_key = D3ZcPSa5zmgWQl4SRgmQa1jhAV9Cgi1P2BUQAFXHDKI