From 993a9e611f666bc559fd2d30e56d758e13d96445 Mon Sep 17 00:00:00 2001
From: Caleb Aguirre-Leon
Date: Sat, 29 Mar 2025 00:01:21 -0400
Subject: [PATCH 01/15] Staging New File

---
 scraper.py | 84 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 84 insertions(+)
 create mode 100644 scraper.py

diff --git a/scraper.py b/scraper.py
new file mode 100644
index 0000000..7dcfe1e
--- /dev/null
+++ b/scraper.py
@@ -0,0 +1,84 @@
+"""
+This module contains a function to search for posts using the BlueSky API.
+"""
+import requests
+from alive_progress import alive_bar
+
+def search_posts(params, token):
+    # pylint: disable=E1102
+    # pylint: disable=C0301
+
+    """
+    Search for posts using the BlueSky API.
+
+    Args:
+        params (dict): The query parameters for the API request.
+            - query (str, required): The search term for the BlueSky posts.
+            - sort (str, optional): The sorting criteria for results.
+                Options include "top" for top posts or "latest" for the latest posts.
+            - since (str, optional): The start date for posts (ISO 8601 format).
+            - until (str, optional): The end date for posts (ISO 8601 format).
+            - mentions (str, optional): Mentions to filter posts by.
+                - Handles will be resolved to DIDs using the provided token.
+            - author (str, optional): The author of the posts (handle or DID).
+            - lang (str, optional): The language of the posts.
+            - domain (str, optional): A domain URL included in the posts.
+            - url (str, optional): A specific URL included in the posts.
+            - tags (list, optional): Tags to filter posts by (each tag <= 640 characters).
+            - limit (int, optional): The maximum number of posts to retrieve in a single response.
+                Defaults to 25.
+            - cursor (str, optional): Pagination token for continuing from a previous request.
+            - posts_limit (int, optional): The maximum number of posts to retrieve across all responses.
+                Defaults to 500.
+
+    Returns:
+        list: A list of posts matching the search criteria.
+
+    Notes:
+        - Progress is displayed using a progress bar indicating the number of posts fetched.
+        - Handles pagination automatically until `posts_limit` is reached or no further results are available.
+        - Logs and returns partial results if an error occurs during fetching.
+    """
+    posts = []
+    url = "https://bsky.social/xrpc/app.bsky.feed.searchPosts"
+    headers = {
+        "Authorization": f"Bearer {token}",
+        "Content-Type": "application/json",
+    }
+
+    total_fetched = 0
+    posts_limit = params.get("posts_limit", 1000)
+
+    with alive_bar(posts_limit) as progress:
+        while True:
+            try:
+                response = requests.get(url, headers=headers, params=params, timeout=10)
+                # print(response)
+                response.raise_for_status()
+                data = response.json()
+
+                #Check if we have reached our overall posts limit
+                new_posts = data.get("posts", [])
+                posts.extend(new_posts)
+                total_fetched += len(new_posts)
+
+                #Update progress bar
+                progress(len(new_posts))
+
+                if posts_limit and total_fetched >= posts_limit:
+                    print(f"Fetched {total_fetched} posts, total: {total_fetched}/{posts_limit}")
+                    return posts[:posts_limit]
+
+                #Move to the enxt page if available
+                next_cursor = data.get("cursor")
+                if not next_cursor:
+                    print(f"All posts fetched. Total: {total_fetched}")
+                    return posts
+
+                params["cursor"] = next_cursor
+            except requests.exceptions.RequestException as err:
+                print(f"Error fetching posts: {err}")
+                print(
+                    "Response:", response.text if "response" in locals() else "No response"
+                )
+                return posts
\ No newline at end of file

From 91889d86464c9d4ae514c5718b587d999b9a6ffc Mon Sep 17 00:00:00 2001
From: Caleb Aguirre-Leon
Date: Sat, 29 Mar 2025 00:06:17 -0400
Subject: [PATCH 02/15] Staging Changes

---
 mission_blue.py | 87 ++-----------------------------------------------
 1 file changed, 2 insertions(+), 85 deletions(-)

diff --git a/mission_blue.py b/mission_blue.py
index 32c7d0c..662f1cc 100644
--- a/mission_blue.py
+++ b/mission_blue.py
@@ -10,6 +10,7 @@
 import click
 import file
 import auth
+import scraper
 
 # pylint: disable=C0301
 lang_dict = {
@@ -217,90 +218,6 @@ def generate_query_params(
     }
 
 
-def search_posts(params, token):
-    # pylint: disable=E1102
-    # pylint: disable=C0301
-
-    """
-    Search for posts using the BlueSky API.
-
-    Args:
-        params (dict): The query parameters for the API request.
-            - query (str, required): The search term for the BlueSky posts.
-            - sort (str, optional): The sorting criteria for results.
-                Options include "top" for top posts or "latest" for the latest posts.
-            - since (str, optional): The start date for posts (ISO 8601 format).
-            - until (str, optional): The end date for posts (ISO 8601 format).
-            - mentions (str, optional): Mentions to filter posts by.
-                - Handles will be resolved to DIDs using the provided token.
-            - author (str, optional): The author of the posts (handle or DID).
-            - lang (str, optional): The language of the posts.
-            - domain (str, optional): A domain URL included in the posts.
-            - url (str, optional): A specific URL included in the posts.
-            - tags (list, optional): Tags to filter posts by (each tag <= 640 characters).
-            - limit (int, optional): The maximum number of posts to retrieve in a single response.
-                Defaults to 25.
-            - cursor (str, optional): Pagination token for continuing from a previous request.
-            - posts_limit (int, optional): The maximum number of posts to retrieve across all responses.
-                Defaults to 500.
-
-    Returns:
-        list: A list of posts matching the search criteria.
-
-    Notes:
-        - Progress is displayed using a progress bar indicating the number of posts fetched.
-        - Handles pagination automatically until `posts_limit` is reached or no further results are available.
-        - Logs and returns partial results if an error occurs during fetching.
-    """
-    posts = []
-    url = "https://bsky.social/xrpc/app.bsky.feed.searchPosts"
-    headers = {
-        "Authorization": f"Bearer {token}",
-        "Content-Type": "application/json",
-    }
-
-    total_fetched = 0
-    posts_limit = params.get("posts_limit")
-    butterfly_bar = bar_factory("✨", tip="🦋", errors="🔥🧯👩‍🚒")
-
-    with alive_bar(posts_limit, bar=butterfly_bar, spinner="waves") as progress:
-        while True:
-            try:
-                response = requests.get(url, headers=headers, params=params, timeout=10)
-                # print(response)
-                response.raise_for_status()
-                data = response.json()
-
-                # Check if we have reached our overall posts limit
-                new_posts = data.get("posts", [])
-                posts.extend(new_posts)
-                total_fetched += len(new_posts)
-
-                # Update progress bar
-                progress(len(new_posts))
-
-                if posts_limit and total_fetched >= posts_limit:
-                    print(
-                        f"Fetched {total_fetched} posts, total: {total_fetched}/{posts_limit}"
-                    )
-                    return posts[:posts_limit]
-
-                # Move to the enxt page if available
-                next_cursor = data.get("cursor")
-                if not next_cursor:
-                    print(f"All posts fetched. Total: {total_fetched}")
-                    return posts
-
-                params["cursor"] = next_cursor
-            except requests.exceptions.RequestException as err:
-                print(f"Error fetching posts: {err}")
-                print(
-                    "Response:",
-                    response.text if "response" in locals() else "No response",
-                )
-                return posts
-
-
 # Begin Click CLI
@@ -446,7 +363,7 @@ def main(
 
     # Fetch posts
     print("Fetching posts...")
-    raw_posts = search_posts(query_param, access_token)
+    raw_posts = scraper.search_posts(query_param, access_token)
 
     # Extract post data
     print("Extracting post data...")

From 137762d3620176803b3875addb14309c80689574 Mon Sep 17 00:00:00 2001
From: Caleb Aguirre-Leon
Date: Sat, 29 Mar 2025 00:12:24 -0400
Subject: [PATCH 03/15] Gaslight

---
 tests/testing | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 tests/testing

diff --git a/tests/testing b/tests/testing
deleted file mode 100644
index e69de29..0000000

From 8b30822253ecc75c268db233ec1ea6183065aa71 Mon Sep 17 00:00:00 2001
From: Caleb Aguirre-Leon
Date: Wed, 14 May 2025 15:37:56 -0400
Subject: [PATCH 04/15] Add testing file

---
 tests/scraper_test.py | 68 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 tests/scraper_test.py

diff --git a/tests/scraper_test.py b/tests/scraper_test.py
new file mode 100644
index 0000000..a3458d0
--- /dev/null
+++ b/tests/scraper_test.py
@@ -0,0 +1,68 @@
+"""Testing suite for the mission_blue module."""
+
+import unittest
+from unittest.mock import Mock, patch
+
+from scraper import (
+    search_posts,
+)
+
+
+class TestCase:
+    """Class used to store test data and expected results for the TestMissionBlue function."""
+
+    def __init__(self, data, expected_result):
+        self.data = data
+        self.expected_result = expected_result
+
+    def get_data(self):
+        # pylint: disable=missing-function-docstring
+        return self.data
+
+    def get_expected_result(self):
+        # pylint: disable=missing-function-docstring
+        return self.expected_result
+
+
+class TestSearchPosts(unittest.TestCase):
+    """_summary_.
+
+    Args:
+        unittest (_type_): _description_
+
+    """
+
+    def test_search_posts(self):
+        """Test case for the validate_url function.
+        This test verifies that the given url contains the correct post data.
+        Test data:
+            - Post Links with valid and invalid post urls.
+            - An expected result boolean.
+        Assertions:
+            - The result of validate_url(data) should match the expected_result.
+        """
+        # If any of the test cases fail, try looking at the no_content_template variable
+        # within the validate_url function.
+        cases = {
+            "Post Exists": TestCase(
+                data="https://bsky.app/profile/witheringtales.bsky.social/post/3legkyuzjs22m",
+                expected_result=True,
+            ),
+            # If the test case fails, look at the validate_url function logic for guidance
+            # on how to fix the test case.
+            "Post Doesn't Exist": TestCase(
+                data="https://bsky.app/profile/witheringtales.bsky.social/post/3legkyuzjs22",
+                expected_result=False,
+            ),
+        }
+
+        for case_name, case in cases.items():
+            with self.subTest(case_name):
+                result = validate_url(case.get_data())
+                self.assertEqual(result, case.get_expected_result())
+
+
+
+
+if __name__ == "__main__":
+    unittest.main()

From b947ad964d5d3f82dfde0b9e14282d2778c61431 Mon Sep 17 00:00:00 2001
From: Caleb Aguirre-Leon
Date: Thu, 15 May 2025 00:54:25 -0400
Subject: [PATCH 05/15] Small changes

---
 mission_blue.py       | 2 --
 tests/scraper_test.py | 2 --
 2 files changed, 4 deletions(-)

diff --git a/mission_blue.py b/mission_blue.py
index 0d7e570..edd8cce 100644
--- a/mission_blue.py
+++ b/mission_blue.py
@@ -2,8 +2,6 @@
 
 import click
 import requests
-from alive_progress import alive_bar
-from alive_progress.animations.bars import bar_factory
 
 import auth
 import scraper
diff --git a/tests/scraper_test.py b/tests/scraper_test.py
index a3458d0..1062567 100644
--- a/tests/scraper_test.py
+++ b/tests/scraper_test.py
@@ -62,7 +62,5 @@ def test_search_posts(self):
                 self.assertEqual(result, case.get_expected_result())
 
 
-
-
 if __name__ == "__main__":
     unittest.main()

From a5d030bac0d70a223c5399b15ac0490fe383cf2b Mon Sep 17 00:00:00 2001
From: andewmark
Date: Tue, 20 May 2025 10:59:36 -0400
Subject: [PATCH 06/15] Imported typing & Fixed Custom Progress Bar

---
 mission_blue.py | 1 +
 scraper.py      | 4 +++-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/mission_blue.py b/mission_blue.py
index 9680005..f809698 100644
--- a/mission_blue.py
+++ b/mission_blue.py
@@ -5,6 +5,7 @@
 import auth
 import scraper
 import file
+from typing import Optional, List, Dict, Any
 
 # pylint: disable=C0301
 lang_dict = {
diff --git a/scraper.py b/scraper.py
index 7dcfe1e..ccb96ad 100644
--- a/scraper.py
+++ b/scraper.py
@@ -3,6 +3,7 @@
 """
 import requests
 from alive_progress import alive_bar
+from alive_progress.animations.bars import bar_factory
 
 def search_posts(params, token):
     # pylint: disable=E1102
@@ -48,8 +49,9 @@ def search_posts(params, token):
 
     total_fetched = 0
     posts_limit = params.get("posts_limit", 1000)
+    butterfly_bar = bar_factory("✨", tip="🦋", errors="🔥🧯👩‍🚒")
 
-    with alive_bar(posts_limit) as progress:
+    with alive_bar(posts_limit, bar=butterfly_bar, spinner="waves") as progress:
         while True:
             try:
                 response = requests.get(url, headers=headers, params=params, timeout=10)

From 1d1f0a7f16fa1a9e846f834ed24cf5b307726689 Mon Sep 17 00:00:00 2001
From: andewmark
Date: Tue, 20 May 2025 11:01:47 -0400
Subject: [PATCH 07/15] Imported typing & Fixed Custom Progress Bar (Again)

---
 scraper.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/scraper.py b/scraper.py
index ccb96ad..974c2f7 100644
--- a/scraper.py
+++ b/scraper.py
@@ -55,7 +55,6 @@ def search_posts(params, token):
         while True:
             try:
                 response = requests.get(url, headers=headers, params=params, timeout=10)
-                # print(response)
                 response.raise_for_status()
                 data = response.json()
 
@@ -68,7 +67,9 @@ def search_posts(params, token):
                 progress(len(new_posts))
 
                 if posts_limit and total_fetched >= posts_limit:
-                    print(f"Fetched {total_fetched} posts, total: {total_fetched}/{posts_limit}")
+                    print(
+                        f"Fetched {total_fetched} posts, total: {total_fetched}/{posts_limit}"
+                    )
                     return posts[:posts_limit]
 
                 #Move to the enxt page if available

From b298745730c02b8f31550512d038a39483fcb875 Mon Sep 17 00:00:00 2001
From: andewmark
Date: Wed, 21 May 2025 13:57:46 -0400
Subject: [PATCH 08/15] imported correct methods into scraper_test.py

---
 mission_blue.py       |  2 ++
 scraper.py            | 21 ++++++++++++---------
 tests/scraper_test.py |  7 ++-----
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/mission_blue.py b/mission_blue.py
index f809698..935e62e 100644
--- a/mission_blue.py
+++ b/mission_blue.py
@@ -6,6 +6,7 @@
 import scraper
 import file
 from typing import Optional, List, Dict, Any
+
 # pylint: disable=C0301
 lang_dict = {
@@ -210,6 +211,7 @@ def generate_query_params(
         "posts_limit": posts_limit,
     }
 
+
 # Begin Click CLI
diff --git a/scraper.py b/scraper.py
index 974c2f7..9ea11ff 100644
--- a/scraper.py
+++ b/scraper.py
@@ -1,10 +1,12 @@
 """
 This module contains a function to search for posts using the BlueSky API.
 """
+
 import requests
 from alive_progress import alive_bar
 from alive_progress.animations.bars import bar_factory
 
+
 def search_posts(params, token):
     # pylint: disable=E1102
     # pylint: disable=C0301
@@ -15,22 +17,22 @@ def search_posts(params, token):
     Args:
         params (dict): The query parameters for the API request.
             - query (str, required): The search term for the BlueSky posts.
-            - sort (str, optional): The sorting criteria for results. 
+            - sort (str, optional): The sorting criteria for results.
                 Options include "top" for top posts or "latest" for the latest posts.
             - since (str, optional): The start date for posts (ISO 8601 format).
             - until (str, optional): The end date for posts (ISO 8601 format).
-            - mentions (str, optional): Mentions to filter posts by. 
+            - mentions (str, optional): Mentions to filter posts by.
                 - Handles will be resolved to DIDs using the provided token.
             - author (str, optional): The author of the posts (handle or DID).
             - lang (str, optional): The language of the posts.
            - domain (str, optional): A domain URL included in the posts.
             - url (str, optional): A specific URL included in the posts.
             - tags (list, optional): Tags to filter posts by (each tag <= 640 characters).
-            - limit (int, optional): The maximum number of posts to retrieve in a single response. 
+            - limit (int, optional): The maximum number of posts to retrieve in a single response.
                 Defaults to 25.
             - cursor (str, optional): Pagination token for continuing from a previous request.
             - posts_limit (int, optional): The maximum number of posts to retrieve across all responses.
-                Defaults to 500. 
+                Defaults to 500.
 
     Returns:
         list: A list of posts matching the search criteria.
@@ -58,30 +60,31 @@ def search_posts(params, token):
                 response.raise_for_status()
                 data = response.json()
 
-                #Check if we have reached our overall posts limit
+                # Check if we have reached our overall posts limit
                 new_posts = data.get("posts", [])
                 posts.extend(new_posts)
                 total_fetched += len(new_posts)
 
-                #Update progress bar
+                # Update progress bar
                 progress(len(new_posts))
 
                 if posts_limit and total_fetched >= posts_limit:
                     print(
                         f"Fetched {total_fetched} posts, total: {total_fetched}/{posts_limit}"
                     )
                     return posts[:posts_limit]
 
-                #Move to the enxt page if available
+                # Move to the enxt page if available
                 next_cursor = data.get("cursor")
                 if not next_cursor:
                     print(f"All posts fetched. Total: {total_fetched}")
                     return posts
 
                 params["cursor"] = next_cursor
             except requests.exceptions.RequestException as err:
                 print(f"Error fetching posts: {err}")
                 print(
-                    "Response:", response.text if "response" in locals() else "No response"
+                    "Response:",
+                    response.text if "response" in locals() else "No response",
                 )
-                return posts
\ No newline at end of file
+                return posts
diff --git a/tests/scraper_test.py b/tests/scraper_test.py
index 1062567..e84c53f 100644
--- a/tests/scraper_test.py
+++ b/tests/scraper_test.py
@@ -1,11 +1,8 @@
 """Testing suite for the mission_blue module."""
 
 import unittest
-from unittest.mock import Mock, patch
-
-from scraper import (
-    search_posts,
-)
+from scraper import search_posts
+from file import validate_url
 
 
 class TestCase:

From 81285c9c64d088c3624b5d56b71fcee363e888ca Mon Sep 17 00:00:00 2001
From: andewmark
Date: Wed, 21 May 2025 19:16:10 -0400
Subject: [PATCH 09/15] Added tests & Updated Logic

search_posts now ensures that query is within params (required).
Test cases validate this new logic.
---
 scraper.py            |  6 ++++++
 tests/scraper_test.py | 83 ++++++++++++++++++++-----------------------
 2 files changed, 42 insertions(+), 47 deletions(-)

diff --git a/scraper.py b/scraper.py
index 9ea11ff..0dd7636 100644
--- a/scraper.py
+++ b/scraper.py
@@ -42,6 +42,12 @@ def search_posts(params, token):
         - Handles pagination automatically until `posts_limit` is reached or no further results are available.
         - Logs and returns partial results if an error occurs during fetching.
     """
+    # Validate input parameters
+    if "query" not in params:
+        raise ValueError("Query parameter is required.")
+    if not token:
+        raise ValueError("Token is required.")
+
     posts = []
     url = "https://bsky.social/xrpc/app.bsky.feed.searchPosts"
     headers = {
diff --git a/tests/scraper_test.py b/tests/scraper_test.py
index e84c53f..d921014 100644
--- a/tests/scraper_test.py
+++ b/tests/scraper_test.py
@@ -1,25 +1,8 @@
 """Testing suite for the mission_blue module."""
 
 import unittest
+from unittest.mock import patch, Mock, MagicMock
 from scraper import search_posts
-from file import validate_url
-
-
-class TestCase:
-    """Class used to store test data and expected results for the TestMissionBlue function."""
-
-    def __init__(self, data, expected_result):
-        self.data = data
-        self.expected_result = expected_result
-
-    def get_data(self):
-        # pylint: disable=missing-function-docstring
-        return self.data
-
-    def get_expected_result(self):
-        # pylint: disable=missing-function-docstring
-        return self.expected_result
-
 
 class TestSearchPosts(unittest.TestCase):
     """_summary_.
@@ -28,38 +11,44 @@ class TestSearchPosts(unittest.TestCase):
     Args:
         unittest (_type_): _description_
 
     """
-
-    def test_search_posts(self):
-        """Test case for the validate_url function.
-        This test verifies that the given url contains the correct post data.
-        Test data:
-            - Post Links with valid and invalid post urls.
-            - An expected result boolean.
-        Assertions:
-            - The result of validate_url(data) should match the expected_result.
-        """
-        # If any of the test cases fail, try looking at the no_content_template variable
-        # within the validate_url function.
-        cases = {
-            "Post Exists": TestCase(
-                data="https://bsky.app/profile/witheringtales.bsky.social/post/3legkyuzjs22m",
-                expected_result=True,
-            ),
-            # If the test case fails, look at the validate_url function logic for guidance
-            # on how to fix the test case.
-            "Post Doesn't Exist": TestCase(
-                data="https://bsky.app/profile/witheringtales.bsky.social/post/3legkyuzjs22",
-                expected_result=False,
-            ),
-        }
-
-        for case_name, case in cases.items():
-            with self.subTest(case_name):
-                result = validate_url(case.get_data())
-                self.assertEqual(result, case.get_expected_result())
+    # Dummy API data for testing
+
+    @patch("scraper.requests.get")
+    def test_no_query(self, mock_get: MagicMock) -> None:
+        """Test if the function raises ValueError when a query is not provided."""
+        params = {}
+        token = "valid_token"
+
+        with self.assertRaises(ValueError) as cm:
+            search_posts(params, token)
+
+        mock_get.assert_not_called()
+        self.assertIn("query", str(cm.exception).lower())
+
+    @patch("scraper.requests.get")
+    def test_no_token(self, mock_get: MagicMock) -> None:
+        """Test if the function raises ValueError when a token it not provided."""
+        params = {"query": "test"}
+        token = None
+
+        with self.assertRaises(ValueError) as cm:
+            search_posts(params, token)
+
+        mock_get.assert_not_called()
+        self.assertIn("token", str(cm.exception).lower())
+
+    # Ensure that the function returns an empty list when no posts are found
+
+    # Ensure that the function returns a list of posts when valid parameters are provided
+
+    # Ensure that the function handles pagination correctly and returns all posts
+
+    # Simulate a failed API response (e.g., 400: [InvalidRequest, ExpiredToken, InvalidToken, BadQueryString])
+
+    # Simulate a failed API response (401)
 
 
 if __name__ == "__main__":
     unittest.main()

From 63b925f0a5f16b92564ba4eaa59853e8de16520d Mon Sep 17 00:00:00 2001
From: andewmark
Date: Thu, 22 May 2025 01:20:54 -0400
Subject: [PATCH 10/15] added test

---
 scraper.py            |  4 +++-
 tests/scraper_test.py | 60 ++++++++++++++++++++++++++++++++++---------
 2 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/scraper.py b/scraper.py
index 0dd7636..169c2be 100644
--- a/scraper.py
+++ b/scraper.py
@@ -3,6 +3,7 @@
 """
 
 import requests
+import pprint
 from alive_progress import alive_bar
 from alive_progress.animations.bars import bar_factory
 
@@ -43,7 +44,7 @@ def search_posts(params, token):
     """
     # Validate input parameters
-    if "query" not in params:
+    if "q" not in params:
         raise ValueError("Query parameter is required.")
     if not token:
         raise ValueError("Token is required.")
@@ -78,6 +79,7 @@ def search_posts(params, token):
                     print(
                         f"Fetched {total_fetched} posts, total: {total_fetched}/{posts_limit}"
                     )
+                    pprint.pprint(posts[0])
                     return posts[:posts_limit]
 
                 # Move to the enxt page if available
diff --git a/tests/scraper_test.py b/tests/scraper_test.py
index d921014..91d93f9 100644
--- a/tests/scraper_test.py
+++ b/tests/scraper_test.py
@@ -1,18 +1,13 @@
 """Testing suite for the mission_blue module."""
 
 import unittest
-from unittest.mock import patch, Mock, MagicMock
+from unittest.mock import patch, MagicMock
 from scraper import search_posts
 
-class TestSearchPosts(unittest.TestCase):
-    """_summary_.
-
-    Args:
-        unittest (_type_): _description_
-
-    """
-    # Dummy API data for testing
+
+class TestSearchPosts(unittest.TestCase):
+    """Testing the search_posts() method."""
 
     @patch("scraper.requests.get")
     def test_no_query(self, mock_get: MagicMock) -> None:
         """Test if the function raises ValueError when a query is not provided."""
@@ -19,16 +14,16 @@ class TestSearchPosts(unittest.TestCase):
         params = {}
         token = "valid_token"
 
         with self.assertRaises(ValueError) as cm:
             search_posts(params, token)
-
+
         mock_get.assert_not_called()
         self.assertIn("query", str(cm.exception).lower())
 
     @patch("scraper.requests.get")
     def test_no_token(self, mock_get: MagicMock) -> None:
         """Test if the function raises ValueError when a token it not provided."""
-        params = {"query": "test"}
+        params = {"q": "test"}
         token = None
 
         with self.assertRaises(ValueError) as cm:
@@ -39,10 +34,47 @@ def test_no_token(self, mock_get: MagicMock) -> None:
 
     # Ensure that the function returns an empty list when no posts are found
 
-    # Ensure that the function returns a list of posts when valid parameters are provided
+    @patch("scraper.requests.get")
+    def test_valid_response(self, mock_get: MagicMock) -> None:
+        """Test that the function returns a list of posts when valid parameters are provided."""
+        params = {"q": "test"}
+        token = "valid_token"
+
+        mock_response = MagicMock()
+        mock_response.status_code = 200
+        mock_response.json.return_value = {
+            "posts": [
+                {
+                    "uri": "at://did:plc:12345/app.bsky.feed.post/abcdef",
+                    "cid": "bafyre123...",
+                    "author": {
+                        "did": "did:plc:12345",
+                        "handle": "author_handle",
+                        "displayName": "Author Name",
+                    },
+                    "record": {
+                        "text": "Post content",
+                        "createdAt": "2023-10-01T00:00:00Z",
+                        "$type": "app.bsky.feed.post",
+                    },
+                    "indexedAt": "2023-10-01T00:00:01Z",
+                }
+            ],
+            "cursor": None,
+        }
+
+        mock_get.return_value = mock_response
+
+        result = search_posts(params, token)
+
+        self.assertEqual(len(result), 1)
+        self.assertEqual(result[0]["record"]["text"], "Post content")
+        self.assertEqual(result[0]["author"]["handle"], "author_handle")
+        self.assertEqual(result[0]["record"]["createdAt"], "2023-10-01T00:00:00Z")
+        self.assertEqual(
+            result[0]["uri"], "at://did:plc:12345/app.bsky.feed.post/abcdef"
+        )
 
-    # Ensure that the function handles pagination correctly and returns all posts
-
     # Simulate a failed API response (e.g., 400: [InvalidRequest, ExpiredToken, InvalidToken, BadQueryString])
 
     # Simulate a failed API response (401)

From ff94fd894bdb2e11220489f0c5b35be7c19e6465 Mon Sep 17 00:00:00 2001
From: andewmark
Date: Thu, 22 May 2025 21:58:34 -0400
Subject: [PATCH 11/15] Added Simulated API response test case

Verified that search_posts() logic handles Client Error 400 correctly
and gracefully
---
 scraper.py            |  2 --
 tests/scraper_test.py | 25 ++++++++++++++++++++++++-
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/scraper.py b/scraper.py
index 169c2be..7050a52 100644
--- a/scraper.py
+++ b/scraper.py
@@ -3,7 +3,6 @@
 """
 
 import requests
-import pprint
 from alive_progress import alive_bar
 from alive_progress.animations.bars import bar_factory
 
@@ -79,7 +78,6 @@ def search_posts(params, token):
                     print(
                         f"Fetched {total_fetched} posts, total: {total_fetched}/{posts_limit}"
                     )
-                    pprint.pprint(posts[0])
                     return posts[:posts_limit]
 
                 # Move to the enxt page if available
diff --git a/tests/scraper_test.py b/tests/scraper_test.py
index 91d93f9..2a531e7 100644
--- a/tests/scraper_test.py
+++ b/tests/scraper_test.py
@@ -1,6 +1,9 @@
 """Testing suite for the mission_blue module."""
 
 import unittest
+import requests
+import io
+import sys
 from unittest.mock import patch, MagicMock
 from scraper import search_posts
 
@@ -76,8 +79,28 @@ def test_valid_response(self, mock_get: MagicMock) -> None:
         )
 
     # Simulate a failed API response (e.g., 400: [InvalidRequest, ExpiredToken, InvalidToken, BadQueryString])
+    @patch("scraper.requests.get")
+    def test_invalid_request(self, mock_get: MagicMock) -> None:
+        """Test that the function handles invalid requests gracefully."""
+        params = {"q": "test"}
+        token = "invalid_token"
+
+        mock_response = MagicMock()
+        mock_response.status_code = 400
+        mock_response.raise_for_status.side_effect = requests.exceptions.HTTPError(
+            "400 Client Error: InvalidToken"
+        )
+        mock_get.return_value = mock_response
+
+        # Redircting stdout to StringIO
+        captured_output = io.StringIO()
+        sys.stdout = captured_output
+
+        result = search_posts(params, token)
 
-    # Simulate a failed API response (401)
+        sys.stdout = sys.__stdout__
 
+        self.assertEqual(result, [])
+        self.assertIn("400 Client Error:", captured_output.getvalue())
 
 
 if __name__ == "__main__":

From 5098d24a32d49faecff57f9c2e559f7360616e1b Mon Sep 17 00:00:00 2001
From: andewmark
Date: Wed, 4 Jun 2025 17:03:48 -0400
Subject: [PATCH 12/15] Resolved Nitpicks and suggested changes

---
 mission_blue.py       | 2 +-
 scraper.py            | 4 ++--
 tests/scraper_test.py | 8 +++-----
 3 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/mission_blue.py b/mission_blue.py
index 935e62e..4a0c338 100644
--- a/mission_blue.py
+++ b/mission_blue.py
@@ -1,4 +1,4 @@
-"""This module conatins the BlueSky Web Scrapper."""
+"""This module contains the BlueSky Web Scraper."""
 
 import click
 import requests
diff --git a/scraper.py b/scraper.py
index 7050a52..384c3dd 100644
--- a/scraper.py
+++ b/scraper.py
@@ -56,7 +56,7 @@ def search_posts(params, token):
     }
 
     total_fetched = 0
-    posts_limit = params.get("posts_limit", 1000)
+    posts_limit = params.get("posts_limit", 500)
     butterfly_bar = bar_factory("✨", tip="🦋", errors="🔥🧯👩‍🚒")
 
     with alive_bar(posts_limit, bar=butterfly_bar, spinner="waves") as progress:
@@ -80,7 +80,7 @@ def search_posts(params, token):
                     )
                     return posts[:posts_limit]
 
-                # Move to the enxt page if available
+                # Move to the nextt page if available
                 next_cursor = data.get("cursor")
                 if not next_cursor:
                     print(f"All posts fetched. Total: {total_fetched}")
diff --git a/tests/scraper_test.py b/tests/scraper_test.py
index 2a531e7..6be76f2 100644
--- a/tests/scraper_test.py
+++ b/tests/scraper_test.py
@@ -1,4 +1,4 @@
-"""Testing suite for the mission_blue module."""
+"""Testing suite for the scraper module."""
 
 import unittest
 import requests
@@ -25,7 +25,7 @@ class TestSearchPosts(unittest.TestCase):
 
     @patch("scraper.requests.get")
     def test_no_token(self, mock_get: MagicMock) -> None:
-        """Test if the function raises ValueError when a token it not provided."""
+        """Test if the function raises ValueError when a token is not provided."""
         params = {"q": "test"}
         token = None
 
@@ -35,8 +35,6 @@ def test_no_token(self, mock_get: MagicMock) -> None:
         mock_get.assert_not_called()
         self.assertIn("token", str(cm.exception).lower())
 
-    # Ensure that the function returns an empty list when no posts are found
-
     @patch("scraper.requests.get")
     def test_valid_response(self, mock_get: MagicMock) -> None:
         """Test that the function returns a list of posts when valid parameters are provided."""
@@ -92,7 +90,7 @@ def test_invalid_request(self, mock_get: MagicMock) -> None:
         )
         mock_get.return_value = mock_response
 
-        # Redircting stdout to StringIO
+        # Redirecting stdout to StringIO
         captured_output = io.StringIO()
         sys.stdout = captured_output

From f122fc5d62d483f21cc8b405a4f09951818c1a25 Mon Sep 17 00:00:00 2001
From: Caleb Aguirre-Leon
Date: Wed, 4 Jun 2025 22:36:23 +0100
Subject: [PATCH 13/15] Update scraper.py

Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>
---
 scraper.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scraper.py b/scraper.py
index 384c3dd..a966251 100644
--- a/scraper.py
+++ b/scraper.py
@@ -72,7 +72,9 @@ def search_posts(params, token):
                 total_fetched += len(new_posts)
 
                 # Update progress bar
-                progress(len(new_posts))
+                # Update progress bar
+                remaining = posts_limit - (total_fetched - len(new_posts))
+                progress(min(len(new_posts), remaining))
 
                 if posts_limit and total_fetched >= posts_limit:
                     print(

From 0a1d4f4f2f30e87bc432d76aacdf41b2b578f38d Mon Sep 17 00:00:00 2001
From: Caleb Aguirre-Leon
Date: Wed, 4 Jun 2025 22:47:56 +0100
Subject: [PATCH 14/15] Applied Recommended Changes from Coderabbit AI

Making this cleanup ensures that our scraper remains functional when
scrapes get large.
---
 scraper.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/scraper.py b/scraper.py
index a966251..1a01de9 100644
--- a/scraper.py
+++ b/scraper.py
@@ -76,13 +76,16 @@ def search_posts(params, token):
                 remaining = posts_limit - (total_fetched - len(new_posts))
                 progress(min(len(new_posts), remaining))
 
-                if posts_limit and total_fetched >= posts_limit:
+                if total_fetched >= posts_limit:
                     print(
                         f"Fetched {total_fetched} posts, total: {total_fetched}/{posts_limit}"
                     )
-                    return posts[:posts_limit]
+                    # Truncate only if we exceeded the limit
+                    if len(posts) > posts_limit:
+                        posts = posts[:posts_limit]
+                    return posts
 
-                # Move to the nextt page if available
+                # Move to the next page if available
                 next_cursor = data.get("cursor")
                 if not next_cursor:
                     print(f"All posts fetched. Total: {total_fetched}")

From 3581b2eabfbcb3106b9b0f7f0a085799808e7bcc Mon Sep 17 00:00:00 2001
From: andewmark
Date: Wed, 4 Jun 2025 19:18:20 -0400
Subject: [PATCH 15/15] Fixed default values

posts_limit was still defaulted to 1000 even though the docstring said
500, and other places had the opposite mismatch. Everything is now set
to a default of 500.
---
 mission_blue.py | 6 +++---
 scraper.py      | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/mission_blue.py b/mission_blue.py
index 4a0c338..03b1ebe 100644
--- a/mission_blue.py
+++ b/mission_blue.py
@@ -304,10 +304,10 @@ def generate_query_params(
     "--posts_limit",
     type=click.IntRange(1, None),
     required=False,
-    default=1000,
+    default=500,
     help=(
         "Set the total number of posts to fetch from the API across all paginated responses. This value limits the total data retrieved "
-        "even if multiple API calls are required. If not specified, 1000 posts will be recieved."
+        "even if multiple API calls are required. If not specified, 500 posts will be received."
     ),
 )
 def main(
@@ -322,7 +322,7 @@ def main(
     url: str = "",
     tags: tuple = (),
     limit: int = 25,
-    posts_limit: int = 1000,
+    posts_limit: int = 500,
 ) -> None:
     """Method that tests if each click param flag is being passed in correctly."""
     # pylint: disable=R0913
diff --git a/scraper.py b/scraper.py
index 1a01de9..933878c 100644
--- a/scraper.py
+++ b/scraper.py
@@ -71,7 +71,6 @@ def search_posts(params, token):
                 posts.extend(new_posts)
                 total_fetched += len(new_posts)
 
-                # Update progress bar
                 # Update progress bar
                 remaining = posts_limit - (total_fetched - len(new_posts))
                 progress(min(len(new_posts), remaining))
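
A minimal usage sketch of scraper.search_posts() as it stands after PATCH
15/15. This is illustrative only: the token value and the query below are
placeholders, and obtaining a real Bearer token is handled elsewhere in the
project (the auth module imported by mission_blue.py, whose interface is not
shown in this series).

    import scraper

    # Placeholder: mission_blue.py obtains an access token through its auth
    # flow before calling scraper.search_posts().
    token = "..."

    # "q" is required; search_posts() raises ValueError without it.
    # "posts_limit" caps the total fetched across paginated responses
    # (it defaults to 500 after PATCH 15/15).
    params = {"q": "mission blue butterfly", "limit": 25, "posts_limit": 100}

    posts = scraper.search_posts(params, token)
    print(f"Retrieved {len(posts)} posts")

Note that search_posts() returns whatever it has fetched so far if a request
fails, so callers should treat a short list as potentially partial results.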