diff --git a/.github/scripts/update_version.py b/.github/scripts/update_version.py index 89f7fd9..085deb4 100644 --- a/.github/scripts/update_version.py +++ b/.github/scripts/update_version.py @@ -1,6 +1,7 @@ import os import re import sys +from pathlib import Path def bump_version(version: str, bump_type: str) -> str: @@ -25,21 +26,20 @@ def bump_version(version: str, bump_type: str) -> str: def main(): """Main function for incrementing the version.""" bump_type = sys.argv[1] if len(sys.argv) > 1 else "patch" - path = "pyproject.toml" + toml_path = Path("pyproject.toml") try: - with open(path, 'r') as f: - content = f.read() + content = toml_path.read_text() except FileNotFoundError: - print(f"::error:: File not found: {path}") + print(f"::error:: File not found: {toml_path}") sys.exit(1) except IOError as e: - print(f"::error:: Error reading file {path}: {e}") + print(f"::error:: Error reading file {toml_path}: {e}") sys.exit(1) match = re.search(r'version\s*=\s*"(\d+)\.(\d+)\.(\d+)"', content) if not match: - print(f"::error:: Version not found in {path}") + print(f"::error:: Version not found in {toml_path}") sys.exit(1) current_version = f"{match.group(1)}.{match.group(2)}.{match.group(3)}" @@ -54,16 +54,16 @@ def main(): ) try: - with open(path, "w") as f: - f.write(new_content) + toml_path.write_text(new_content) except IOError as e: - print(f"::error:: Error writing to file {path}: {e}") + print(f"::error:: Error writing to file {toml_path}: {e}") sys.exit(1) - github_output_path = os.environ.get("GITHUB_OUTPUT") - if github_output_path: + github_output_path_str = os.environ.get("GITHUB_OUTPUT") + if github_output_path_str: + github_output_path = Path(github_output_path_str) try: - with open(github_output_path, "a") as gh_out: + with github_output_path.open("a") as gh_out: gh_out.write(f"new_version={new_version}\n") except IOError as e: print(f"::warning:: Error writing to GITHUB_OUTPUT ({github_output_path}): {e}") diff --git 
a/concall_parser/agents/check_moderator.py b/concall_parser/agents/check_moderator.py index 04c1f11..b19ba26 100644 --- a/concall_parser/agents/check_moderator.py +++ b/concall_parser/agents/check_moderator.py @@ -81,10 +81,13 @@ def process(page_text: str, groq_model: str) -> str: {"role": "system", "content": CONTEXT}, {"role": "user", "content": page_text}, ] + response = '{"moderator": ""}' # Initialize with a default value to prevent UnboundLocalError try: response = get_groq_response(messages=messages, model=groq_model) except Exception: logger.exception( "Could not get groq response for management extraction" ) + # If an exception occurs, the initialized default response will be returned. + # This ensures the function always returns a string as per its type hint. return response diff --git a/concall_parser/agents/classify.py b/concall_parser/agents/classify.py index 8524325..f4bc8a6 100644 --- a/concall_parser/agents/classify.py +++ b/concall_parser/agents/classify.py @@ -62,7 +62,7 @@ class ClassifyModeratorIntent: """Classify moderator statements into categories.""" @staticmethod - def process(dialogue: str, groq_model: str): + def process(dialogue: str, groq_model: str) -> str: """Classify a moderator statement into one of the three categories. 
Args: diff --git a/concall_parser/agents/extraction.py b/concall_parser/agents/extraction.py index 0e69849..55a4c96 100644 --- a/concall_parser/agents/extraction.py +++ b/concall_parser/agents/extraction.py @@ -1,5 +1,6 @@ from concall_parser.log_config import logger from concall_parser.utils.get_groq_responses import get_groq_response +import json # TODO: add second prompt case, for apollo (may be solved using regex but idk) @@ -87,7 +88,7 @@ class ExtractManagement: """Class to extract management information from a PDF document.""" @staticmethod - def process(page_text: str, groq_model: str) -> str: + def process(page_text: str, groq_model: str) -> dict: """Process the given page text to extract relevant management information. Args: @@ -96,31 +97,38 @@ def process(page_text: str, groq_model: str) -> str: groq_model (str): The model to use for Groq queries. Returns: - None + dict: A dictionary containing extracted management information, + or an empty dictionary if no information is found or an error occurs. """ # TODO: context selection logic is wrong, recheck. # The current logic switches context if page_text is empty, which is likely not # the intended behavior for SPEAKER_SELECTION_CONTEXT. An empty page_text # should probably result in an empty response or an error. - if page_text: # Pythonic way to check for non-empty string - messages = [ - {"role": "system", "content": CONTEXT}, - {"role": "user", "content": page_text}, - ] - else: - # This branch is reached if page_text is empty. - # Using SPEAKER_SELECTION_CONTEXT with an empty user message is likely incorrect. - # Consider returning an empty dict or raising an error here. + if not page_text: # More explicit check for empty string logger.warning("Received empty page_text for extraction. Returning empty response.") - return "{}" # Returning an empty JSON string as per "If no management information is found, return an empty dict: {}." 
+ return {} # Return empty dict directly + + # The current implementation always uses CONTEXT. The TODOs indicate a missing + # context selection logic for SPEAKER_SELECTION_CONTEXT. + messages = [ + {"role": "system", "content": CONTEXT}, + {"role": "user", "content": page_text}, + ] # TODO: update data model of response in case of speaker selection # TODO: add company name fix in case of speaker selection try: - response = get_groq_response(messages=messages, model=groq_model) - return response + response_str = get_groq_response(messages=messages, model=groq_model) + # Attempt to parse the response as JSON for robustness and consistency + parsed_response = json.loads(response_str) + return parsed_response + except json.JSONDecodeError: + logger.exception( + "Groq response for management extraction was not valid JSON." + ) + return {} except Exception: logger.exception( "Could not get groq response for management extraction" ) - return "{}" # Ensure a consistent return type even on error + return {} # Ensure a consistent return type even on error diff --git a/concall_parser/base_parser.py b/concall_parser/base_parser.py index 7be4e7f..7c43006 100644 --- a/concall_parser/base_parser.py +++ b/concall_parser/base_parser.py @@ -7,4 +7,4 @@ class BaseExtractor(ABC): @abstractmethod def extract(self, *args, **kwargs): """Extracts data from the input.""" - pass + pass \ No newline at end of file diff --git a/concall_parser/extractors/dialogue_extractor.py b/concall_parser/extractors/dialogue_extractor.py index ee1d44a..77f07b8 100644 --- a/concall_parser/extractors/dialogue_extractor.py +++ b/concall_parser/extractors/dialogue_extractor.py @@ -36,9 +36,19 @@ def _handle_leftover_text( cleaned = clean_text(leftover_text) if current_analyst: - self.dialogues["analyst_discussion"][current_analyst]["dialogue"][ - -1 - ]["dialogue"] += f" {cleaned}" + analyst_dialogues = self.dialogues["analyst_discussion"][current_analyst][ + "dialogue" + ] + if analyst_dialogues: + 
analyst_dialogues[-1]["dialogue"] += f" {cleaned}" + else: + # If this is the first dialogue for the analyst, treat leftover as their initial statement. + analyst_dialogues.append( + { + "speaker": current_analyst, # Assuming the analyst is speaking + "dialogue": cleaned, + } + ) elif self.dialogues["commentary_and_future_outlook"]: self.dialogues["commentary_and_future_outlook"][-1]["dialogue"] += ( f" {cleaned}" @@ -76,29 +86,28 @@ def _append_dialogue( } ) - def _process_match( - self, match, groq_model: str, current_analyst: str | None - ): - speaker = match.group("speaker").strip() - dialogue = match.group("dialogue") - intent = None - - if speaker == "Moderator": - response = json.loads( - ClassifyModeratorIntent.process( - dialogue=dialogue, groq_model=groq_model - ) + def _process_moderator_dialogue( + self, dialogue: str, groq_model: str + ) -> tuple[str, str | None]: + """Processes moderator dialogue to classify intent and extract analyst info. + Updates self.dialogues directly for new analyst discussions. 
+ """ + response = json.loads( + ClassifyModeratorIntent.process( + dialogue=dialogue, groq_model=groq_model ) - intent = response["intent"] - if intent == "new_analyst_start": - current_analyst = response["analyst_name"] - self.dialogues["analyst_discussion"][current_analyst] = { - "analyst_company": response["analyst_company"], - "dialogue": [], - } - return intent, current_analyst, None # Moderator handled + ) + intent = response["intent"] + current_analyst = None - return intent, current_analyst, (speaker, dialogue) + if intent == "new_analyst_start": + current_analyst = response["analyst_name"] + analyst_company = response["analyst_company"] + self.dialogues["analyst_discussion"][current_analyst] = { + "analyst_company": analyst_company, + "dialogue": [], + } + return intent, current_analyst def extract_commentary_and_future_outlook( self, transcript: dict[int, str], groq_model: str @@ -126,15 +135,12 @@ def extract_commentary_and_future_outlook( for match in self.speaker_pattern.finditer(text): speaker = match.group("speaker").strip() last_speaker = speaker + dialogue_content = match.group("dialogue") if speaker == "Moderator": - response = json.loads( - ClassifyModeratorIntent.process( - dialogue=match.group("dialogue"), - groq_model=groq_model, - ) + intent, current_analyst = self._process_moderator_dialogue( + dialogue_content, groq_model ) - intent = response["intent"] if intent == "new_analyst_start": return self.dialogues["commentary_and_future_outlook"] continue @@ -142,7 +148,7 @@ def extract_commentary_and_future_outlook( if intent == "opening": self._append_dialogue( speaker, - match.group("dialogue"), + dialogue_content, intent, current_analyst, ) @@ -178,30 +184,19 @@ def extract_dialogues( for match in self.speaker_pattern.finditer(text): speaker = match.group("speaker").strip() last_speaker = speaker + dialogue_content = match.group("dialogue") if speaker == "Moderator": - response = json.loads( - ClassifyModeratorIntent.process( - 
dialogue=match.group("dialogue"), - groq_model=groq_model, - ) + intent, new_analyst = self._process_moderator_dialogue(dialogue_content, groq_model) + if new_analyst is not None: + current_analyst = new_analyst - intent = response["intent"] - if intent == "new_analyst_start": - current_analyst = response["analyst_name"] - self.dialogues["analyst_discussion"][ - current_analyst - ] = { - "analyst_company": response["analyst_company"], - "dialogue": [], - } continue if intent is None: break self._append_dialogue( - speaker, match.group("dialogue"), intent, current_analyst + speaker, dialogue_content, intent, current_analyst ) return self.dialogues diff --git a/concall_parser/extractors/management_case_extractor.py b/concall_parser/extractors/management_case_extractor.py index faf1a73..0a9d026 100644 --- a/concall_parser/extractors/management_case_extractor.py +++ b/concall_parser/extractors/management_case_extractor.py @@ -5,6 +5,24 @@ class ManagementCaseExtractor: """Handles case where moderator is not present.""" + + # Pre-compile the regex for performance and readability + SPEAKER_SPEECH_PATTERN = re.compile( + r""" + ([A-Z]\.\s)? # Optional initial (e.g., "J. "). Group 1. + ([A-Za-z\s]+) # Speaker name (e.g., "John Doe"). Group 2. + :\s # Colon and space separator. + (.*?) # Non-greedy match for the speech content. Group 3. + (?= # Positive lookahead for either: + \s[A-Z]\.?\s? # Another speaker pattern (space, initial, optional dot, optional space, + [A-Za-z\s]+:\s # name, colon, space). + | # OR + $ # End of the string. + ) + """, + re.DOTALL | re.VERBOSE, + ) + def extract(self, transcript: dict[str, str]): """Extracts speaker names and their corresponding speeches from the transcript. 
@@ -21,11 +39,7 @@ def extract(self, transcript: dict[str, str]): speech_pair: dict[str, list[str]] = {} for _, text in transcript.items(): - matches = re.findall( - r"([A-Z]\.\s)?([A-Za-z\s]+):\s(.*?)(?=\s[A-Z]\.?\s?[A-Za-z\s]+:\s|$)", - text, - re.DOTALL, - ) + matches = self.SPEAKER_SPEECH_PATTERN.findall(text) for initial, name, speech in matches: speaker = ( diff --git a/concall_parser/log_config.py b/concall_parser/log_config.py index 56817da..1d6a2eb 100644 --- a/concall_parser/log_config.py +++ b/concall_parser/log_config.py @@ -29,4 +29,4 @@ def configure_logger( if save_to_file: file_handler = logging.FileHandler(log_file) file_handler.setFormatter(formatter) - logger.addHandler(file_handler) \ No newline at end of file + logger.addHandler(file_handler) diff --git a/concall_parser/parser.py b/concall_parser/parser.py index af4e518..a3d6319 100644 --- a/concall_parser/parser.py +++ b/concall_parser/parser.py @@ -70,17 +70,14 @@ def _get_document_transcript(self, filepath: str, link: str) -> dict[int, str]: ) if link: - self.transcript = get_transcript_from_link(link=link) + transcript = get_transcript_from_link(link=link) else: - self.transcript = get_document_transcript(filepath=filepath) - return self.transcript + transcript = get_document_transcript(filepath=filepath) + return transcript def extract_concall_info(self) -> dict: """Extracts company name and management team from the transcript. - Args: - None - Returns: dict: Company name and management team as a dictionary. 
""" diff --git a/concall_parser/utils/file_utils.py b/concall_parser/utils/file_utils.py index deb7c0d..c0d201b 100644 --- a/concall_parser/utils/file_utils.py +++ b/concall_parser/utils/file_utils.py @@ -1,5 +1,4 @@ import json -import os import tempfile from pathlib import Path @@ -123,7 +122,10 @@ def get_transcript_from_link(link:str) -> dict[int, str]: logger.debug("Request to get transcript from link: %s", link) headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" # noqa: E501 + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + ) } # Use a higher timeout for potentially large PDF downloads response = requests.get(url=link, headers=headers, timeout=60, stream=True) @@ -152,7 +154,7 @@ def get_transcript_from_link(link:str) -> dict[int, str]: # Ensure the temporary file is cleaned up, even if errors occur if temp_doc_path and temp_doc_path.exists(): # Check if path was assigned and exists try: - os.remove(temp_doc_path) + temp_doc_path.unlink() logger.debug("Cleaned up temporary file: %s", temp_doc_path) except OSError as e: logger.warning("Could not remove temporary file %s: %s", temp_doc_path, e) diff --git a/concall_parser/utils/get_groq_responses.py b/concall_parser/utils/get_groq_responses.py index a975daa..fcaff6e 100644 --- a/concall_parser/utils/get_groq_responses.py +++ b/concall_parser/utils/get_groq_responses.py @@ -1,5 +1,3 @@ -from typing import List, Dict, Any - from groq import APIStatusError, Groq from concall_parser.config import get_groq_api_key @@ -8,7 +6,7 @@ client = Groq(api_key=get_groq_api_key()) -def get_groq_response(messages: List[Dict[str, str]], model: str) -> str | None: +def get_groq_response(messages: list[dict[str, str]], model: str) -> str | None: """Get response from Groq API.""" try: response = client.chat.completions.create( diff --git 
a/dev-requirements.txt b/dev-requirements.txt index 5b43976..223b0f2 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,7 +1,7 @@ pre-commit==4.5.0 -pytest==9.0.1 +pytest==9.0.2 pytest-regressions==2.8.3 ruff==0.14.8 python-dotenv==1.2.1 -groq==0.37.0 +groq==0.37.1 requests==2.32.5 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index c5b12fe..7a252d0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ packages = [ [tool.poetry.dependencies] python = "^3.12" -groq = "0.37.0" +groq = "0.37.1" pdfplumber = "0.11.8" python-dotenv = "1.2.1" requests = "2.32.5" diff --git a/tests/test_against_old.py b/tests/test_against_old.py index 1b54025..e0298ed 100644 --- a/tests/test_against_old.py +++ b/tests/test_against_old.py @@ -1,5 +1,4 @@ import json -import os import pathlib import pytest diff --git a/tests/test_breaking_changes.py b/tests/test_breaking_changes.py index 4d372f0..68f0a26 100644 --- a/tests/test_breaking_changes.py +++ b/tests/test_breaking_changes.py @@ -1,5 +1,4 @@ import filecmp -from typing import List from pathlib import Path from tests.test_parsing import process_single_file @@ -27,7 +26,7 @@ def test_single_file_processing(filepath: Path, output_dir: Path, expected_outpu print("Test passed") -def test_multiple_files_processing(input_files: List[Path], output_dir: Path, expected_output_dirs: List[Path]) -> None: +def test_multiple_files_processing(input_files: list[Path], output_dir: Path, expected_output_dirs: list[Path]) -> None: """Test processing multiple files and compare the outputs with the expected outputs.""" for input_file, expected_output_dir in zip(input_files, expected_output_dirs): test_single_file_processing(input_file, output_dir, expected_output_dir) diff --git a/tests/test_parsing.py b/tests/test_parsing.py index d1a6690..168c9a7 100644 --- 
a/tests/test_parsing.py +++ b/tests/test_parsing.py @@ -68,26 +68,27 @@ def process_batch(test_dir_path: str, test_all: bool = False): # files_to_process -= successful_files # Sort files for consistent processing order, useful for debugging and reproducibility - files_to_process_sorted = sorted(list(files_to_process)) + files_to_process_sorted = sorted(files_to_process) # Use 'with' statements for log files to ensure they are properly closed - with open(FAILED_FILES_LOG, "w", encoding="utf-8") as failed_log, \ - open(SUCCESS_FILES_LOG, "w", encoding="utf-8") as successful_log: - + with ( + open(FAILED_FILES_LOG, "w", encoding="utf-8") as failed_log, + open(SUCCESS_FILES_LOG, "w", encoding="utf-8") as successful_log, + ): for path in files_to_process_sorted: filepath = os.path.join(test_dir_path, path) # Double-check if the path points to an actual file if not os.path.isfile(filepath): logger.warning(f"Skipping non-file entry: {filepath}") - failed_log.write(path + "\n") # Log non-files as failed to prevent re-attempting + failed_log.write(path + "\n") # Log non-files as failed to prevent re-attempting continue try: logger.info(f"Testing {path}") process_single_file(filepath, path) successful_log.write(path + "\n") - except Exception: # Catching bare Exception is generally discouraged but kept for original intent + except Exception: # Catching bare Exception is generally discouraged but kept for original intent failed_log.write(path + "\n") logger.exception(f"Error while processing file {path}") continue