From 91289e5683ed221ea00467629cd159d20ac43438 Mon Sep 17 00:00:00 2001 From: Jay Shah <61103636+JS12540@users.noreply.github.com> Date: Sun, 7 Dec 2025 01:09:48 +0530 Subject: [PATCH] refactor: AI Migration for 15 files Automated changes by CodeEvolve. --- .github/scripts/update_version.py | 34 +++++-- concall_parser/agents/classify.py | 18 ++-- concall_parser/agents/extraction.py | 19 ++-- concall_parser/agents/verify_speakers.py | 2 +- concall_parser/extractors/management.py | 5 +- .../extractors/management_case_extractor.py | 8 +- concall_parser/parser.py | 4 +- concall_parser/utils/file_utils.py | 98 +++++++++++++------ concall_parser/utils/get_groq_responses.py | 4 +- dev-requirements.txt | 14 +-- pyproject.toml | 16 +-- tests/test_against_old.py | 3 +- tests/test_breaking_changes.py | 17 ++-- tests/test_only_management_case.py | 2 +- tests/test_parsing.py | 72 ++++++++------ 15 files changed, 202 insertions(+), 114 deletions(-) diff --git a/.github/scripts/update_version.py b/.github/scripts/update_version.py index 4d7830f..89f7fd9 100644 --- a/.github/scripts/update_version.py +++ b/.github/scripts/update_version.py @@ -27,13 +27,20 @@ def main(): bump_type = sys.argv[1] if len(sys.argv) > 1 else "patch" path = "pyproject.toml" - with open(path) as f: - content = f.read() + try: + with open(path, 'r') as f: + content = f.read() + except FileNotFoundError: + print(f"::error:: File not found: {path}") + sys.exit(1) + except IOError as e: + print(f"::error:: Error reading file {path}: {e}") + sys.exit(1) match = re.search(r'version\s*=\s*"(\d+)\.(\d+)\.(\d+)"', content) if not match: - print("Version not found in pyproject.toml") - return + print(f"::error:: Version not found in {path}") + sys.exit(1) current_version = f"{match.group(1)}.{match.group(2)}.{match.group(3)}" new_version = bump_version(current_version, bump_type) @@ -46,11 +53,22 @@ def main(): content, ) - with open(path, "w") as f: - f.write(new_content) + try: + with open(path, "w") as f: + f.write(new_content) + except IOError as e: + print(f"::error:: Error writing to file {path}: {e}") + sys.exit(1) - with open(os.environ["GITHUB_OUTPUT"], "a") as gh_out: - gh_out.write(f"new_version={new_version}\n") + github_output_path = os.environ.get("GITHUB_OUTPUT") + if github_output_path: + try: + with open(github_output_path, "a") as gh_out: + gh_out.write(f"new_version={new_version}\n") + except IOError as e: + print(f"::warning:: Error writing to GITHUB_OUTPUT ({github_output_path}): {e}") + else: + print("::warning:: GITHUB_OUTPUT environment variable not set. Cannot output new_version.") if __name__ == "__main__": diff --git a/concall_parser/agents/classify.py b/concall_parser/agents/classify.py index 0976ed3..8524325 100644 --- a/concall_parser/agents/classify.py +++ b/concall_parser/agents/classify.py @@ -13,16 +13,16 @@ Response should be in json format for opening and end, like this: { - "intent": "opening" - "reasoning": Provide a reasoning for the intent + "intent": "opening", + "reasoning": "Provide a reasoning for the intent" } If it's new_analyst_start, response should be in json format like this: { "intent": "new_analyst_start", "analyst_name":"analyst_name present in the moderator statement", - "analyst_company:""analyst_company present in the moderator statement" - "reasoning": Provide a reasoning for the intent + "analyst_company": "analyst_company present in the moderator statement", + "reasoning": "Provide a reasoning for the intent" } EXAMPLES: @@ -32,7 +32,7 @@ Response: { - "intent": "opening" + "intent": "opening", "reasoning": "From the moderator statement, it's the start of the call, as the moderator is welcoming everyone to the concall." } @@ -42,9 +42,9 @@ Response: { - "intent": "new_analyst_start" - "analyst_name": "Mukesh Saraf" - "analyst_company": "Avendus Spark" + "intent": "new_analyst_start", + "analyst_name": "Mukesh Saraf", + "analyst_company": "Avendus Spark", "reasoning": "From the moderator statement, it's introducing an analyst from a new company to start the Q&A session." } @@ -52,7 +52,7 @@ Response: { - "intent": "end" + "intent": "end", "reasoning": "From the moderator statement, it's closing the call." } """ # noqa diff --git a/concall_parser/agents/extraction.py b/concall_parser/agents/extraction.py index 16c0de2..0e69849 100644 --- a/concall_parser/agents/extraction.py +++ b/concall_parser/agents/extraction.py @@ -64,7 +64,7 @@ Kunal Dhamesha Disclaimer Currently, 34 wells have been put on stream -\u2013 Managing Director and Chief Executive Officer, Siemens Limited - Thank you very much and all the best and a very happy year ahead. +– Managing Director and Chief Executive Officer, Siemens Limited - Thank you very much and all the best and a very happy year ahead. Output: @@ -98,17 +98,21 @@ def process(page_text: str, groq_model: str) -> str: Returns: None """ - # TODO: context selection logic is wrong, recheck - if page_text != "": + # TODO: context selection logic is wrong, recheck. + # The current logic switches context if page_text is empty, which is likely not + # the intended behavior for SPEAKER_SELECTION_CONTEXT. An empty page_text + # should probably result in an empty response or an error. + if page_text: # Pythonic way to check for non-empty string messages = [ {"role": "system", "content": CONTEXT}, {"role": "user", "content": page_text}, ] else: - messages = [ - {"role": "system", "content": SPEAKER_SELECTION_CONTEXT}, - {"role": "user", "content": page_text}, - ] + # This branch is reached if page_text is empty. + # Using SPEAKER_SELECTION_CONTEXT with an empty user message is likely incorrect. + # Consider returning an empty dict or raising an error here. + logger.warning("Received empty page_text for extraction. Returning empty response.") + return "{}" # Returning an empty JSON string as per "If no management information is found, return an empty dict: {}." # TODO: update data model of response in case of speaker selection # TODO: add company name fix in case of speaker selection @@ -119,3 +123,4 @@ def process(page_text: str, groq_model: str) -> str: logger.exception( "Could not get groq response for management extraction" ) + return "{}" # Ensure a consistent return type even on error diff --git a/concall_parser/agents/verify_speakers.py b/concall_parser/agents/verify_speakers.py index 3254457..01ee653 100644 --- a/concall_parser/agents/verify_speakers.py +++ b/concall_parser/agents/verify_speakers.py @@ -80,7 +80,7 @@ class VerifySpeakerNames: """Finds actual names from extracted speaker pattern.""" @staticmethod - def process(speakers: str, groq_model: str): + def process(speakers: str, groq_model: str) -> str: """Returns the actual names out of all the speaker pattern matches provided. Args: diff --git a/concall_parser/extractors/management.py b/concall_parser/extractors/management.py index 79f9589..57b4214 100644 --- a/concall_parser/extractors/management.py +++ b/concall_parser/extractors/management.py @@ -15,6 +15,9 @@ def extract(self, text: str, groq_model: str) -> dict: page_text=text, groq_model=groq_model ) return json.loads(response) + except json.JSONDecodeError: + logger.exception("Failed to decode JSON response from management extraction.") + return {} except Exception: - logger.exception("Failed to extract management team.") + logger.exception("An unexpected error occurred during management extraction.") return {} diff --git a/concall_parser/extractors/management_case_extractor.py b/concall_parser/extractors/management_case_extractor.py index 5d042c2..faf1a73 100644 --- a/concall_parser/extractors/management_case_extractor.py +++ b/concall_parser/extractors/management_case_extractor.py @@ -5,7 +5,7 @@ class ManagementCaseExtractor: """Handles case where moderator is not present.""" - def extract(self, transcript:dict[int,str]): + def extract(self, transcript: dict[str, str]): """Extracts speaker names and their corresponding speeches from the transcript. To be used when moderator is not present in transcript. @@ -29,8 +29,8 @@ def extract(self, transcript:dict[int,str]): for initial, name, speech in matches: speaker = ( - f"{(initial or '').strip()} {name.strip()}".strip() - ) # Clean speaker name + f"{(initial or '').strip()} {name.strip()}" + ).strip() # Clean speaker name speech = re.sub(r"\n", " ", speech).strip() # Clean speech text if speaker not in all_speakers: @@ -40,4 +40,4 @@ def extract(self, transcript:dict[int,str]): speech_pair[speaker].append(speech) logger.debug(f"Extracted Speakers: {all_speakers}") - return speech_pair \ No newline at end of file + return speech_pair diff --git a/concall_parser/parser.py b/concall_parser/parser.py index b1d2136..af4e518 100644 --- a/concall_parser/parser.py +++ b/concall_parser/parser.py @@ -62,10 +62,10 @@ def _get_document_transcript(self, filepath: str, link: str) -> dict[int, str]: transcript: Dictionary of page number, page text pair. Raises: - Exception in case neither of filepath or link are provided. + ValueError: In case neither of filepath or link are provided. """ if not (filepath or link): - raise Exception( + raise ValueError( "Concall source cannot be empty. Provide filepath or link to concall." ) diff --git a/concall_parser/utils/file_utils.py b/concall_parser/utils/file_utils.py index de70096..deb7c0d 100644 --- a/concall_parser/utils/file_utils.py +++ b/concall_parser/utils/file_utils.py @@ -1,5 +1,7 @@ import json import os +import tempfile +from pathlib import Path import pdfplumber import requests @@ -19,7 +21,7 @@ def get_document_transcript(filepath: str) -> dict[int, str]: transcript = {} try: with pdfplumber.open(filepath) as pdf: - logger.debug("Loaded document") + logger.debug("Loaded document: %s", filepath) page_number = 1 for page in pdf.pages: text = page.extract_text() @@ -28,9 +30,14 @@ def get_document_transcript(filepath: str) -> dict[int, str]: page_number += 1 return transcript except FileNotFoundError: - raise FileNotFoundError("Please check if file exists.") - except Exception: - logger.exception("Could not load file %s", filepath) + logger.error("File not found: %s", filepath) + raise FileNotFoundError(f"Please check if file exists: {filepath}") + except (pdfplumber.PDFSyntaxError, pdfplumber.PDFDataError) as e: + logger.exception("Error parsing PDF file %s: %s", filepath, e) + raise ValueError(f"Error parsing PDF file: {filepath}") from e + except Exception as e: + logger.exception("Could not load file %s: %s", filepath, e) + raise # Re-raise the exception after logging def save_output( @@ -46,15 +53,22 @@ def save_output( output_base_path (str): Path to directory in which outputs are to be saved. document_name (str): Name of the file being parsed, corresponds to company name for now. """ + # Use pathlib for robust path handling + output_base_path_obj = Path(output_base_path) + document_stem = Path(document_name).stem # Get filename without extension + + output_dir_path_obj = output_base_path_obj / document_stem + output_dir_path_obj.mkdir(parents=True, exist_ok=True) + for dialogue_type, dialogue in dialogues.items(): - output_dir_path = os.path.join( - output_base_path, os.path.basename(document_name)[:-4] - ) - os.makedirs(output_dir_path, exist_ok=True) - with open( - os.path.join(output_dir_path, f"{dialogue_type}.json"), "w" - ) as file: - json.dump(dialogue, file, indent=4) + output_file_path = output_dir_path_obj / f"{dialogue_type}.json" + try: + with open(output_file_path, "w", encoding="utf-8") as file: + json.dump(dialogue, file, indent=4) + logger.debug("Saved %s to %s", dialogue_type, output_file_path) + except OSError as e: + logger.exception("Could not save dialogue type %s to %s: %s", dialogue_type, output_file_path, e) + raise # Re-raise after logging def save_transcript( @@ -72,16 +86,23 @@ def save_transcript( output_base_path (str): Path of directory where transcripts are to be saved. """ try: - document_name = os.path.basename(document_path)[:-4] # remove the .pdf - output_dir_path = os.path.join(output_base_path, document_name) - os.makedirs(output_base_path, exist_ok=True) - with open(f"{output_dir_path}.txt", "w") as file: + output_base_path_obj = Path(output_base_path) + output_base_path_obj.mkdir(parents=True, exist_ok=True) + + document_name_stem = Path(document_path).stem # Get filename without extension + output_file_path = output_base_path_obj / f"{document_name_stem}.txt" + + with open(output_file_path, "w", encoding="utf-8") as file: for _, text in transcript.items(): file.write(text) file.write("\n\n") - logger.info("Saved transcript text to file\n") - except Exception: - logger.exception("Could not save document transcript") + logger.info("Saved transcript text to file: %s", output_file_path) + except OSError as e: + logger.exception("Could not save document transcript to %s: %s", output_file_path, e) + raise # Re-raise after logging + except Exception as e: # Catch any other unexpected errors + logger.exception("An unexpected error occurred while saving transcript: %s", e) + raise def get_transcript_from_link(link:str) -> dict[int, str]: @@ -96,23 +117,42 @@ def get_transcript_from_link(link:str) -> dict[int, str]: Raises: Http error, if encountered during downloading document. """ + transcript = dict() + temp_doc_path = None # Initialize to None for finally block try: - logger.debug("Request to get transcript from link.") + logger.debug("Request to get transcript from link: %s", link) headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"# noqa: E501 + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" # noqa: E501 } - response = requests.get(url=link, headers=headers, timeout=30, stream=True) + # Use a higher timeout for potentially large PDF downloads + response = requests.get(url=link, headers=headers, timeout=60, stream=True) response.raise_for_status() - temp_doc_path = "temp_document.pdf" - with open(temp_doc_path, 'wb') as temp_pdf: + # Use tempfile for secure and automatic cleanup of temporary files + with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf: + temp_doc_path = Path(temp_pdf.name) for chunk in response.iter_content(chunk_size=8192): temp_pdf.write(chunk) - transcript = get_document_transcript(filepath=temp_doc_path) - os.remove(temp_doc_path) + logger.debug("Downloaded PDF to temporary file: %s", temp_doc_path) + transcript = get_document_transcript(filepath=str(temp_doc_path)) return transcript - except Exception: - logger.exception("Could not get transcript from link") - return dict() \ No newline at end of file + except requests.exceptions.RequestException as e: + logger.exception("HTTP/Network error while getting transcript from link %s: %s", link, e) + # Optionally re-raise a more specific custom exception if needed by calling code + raise ConnectionError(f"Failed to download PDF from {link}") from e + except (OSError, ValueError) as e: # Catch errors from file operations or PDF parsing + logger.exception("File/PDF processing error after downloading from link %s: %s", link, e) + raise + except Exception as e: + logger.exception("An unexpected error occurred while getting transcript from link %s: %s", link, e) + raise + finally: + # Ensure the temporary file is cleaned up, even if errors occur + if temp_doc_path and temp_doc_path.exists(): # Check if path was assigned and exists + try: + os.remove(temp_doc_path) + logger.debug("Cleaned up temporary file: %s", temp_doc_path) + except OSError as e: + logger.warning("Could not remove temporary file %s: %s", temp_doc_path, e) diff --git a/concall_parser/utils/get_groq_responses.py b/concall_parser/utils/get_groq_responses.py index 7bd6a9f..a975daa 100644 --- a/concall_parser/utils/get_groq_responses.py +++ b/concall_parser/utils/get_groq_responses.py @@ -1,3 +1,5 @@ +from typing import List, Dict, Any + from groq import APIStatusError, Groq from concall_parser.config import get_groq_api_key @@ -6,7 +8,7 @@ client = Groq(api_key=get_groq_api_key()) -def get_groq_response(messages, model): +def get_groq_response(messages: List[Dict[str, str]], model: str) -> str | None: """Get response from Groq API.""" try: response = client.chat.completions.create( diff --git a/dev-requirements.txt b/dev-requirements.txt index f32d9dd..5b43976 100644 --- a/dev-requirements.txt +++ b/dev-requirements.txt @@ -1,7 +1,7 @@ -pre-commit==3.7.0 -pytest==8.3.5 -pytest-regressions==2.7.0 -ruff==0.4.1 -python-dotenv==1.1.0 -groq==0.22.0 -requests==2.32.2 +pre-commit==4.5.0 +pytest==9.0.1 +pytest-regressions==2.8.3 +ruff==0.14.8 +python-dotenv==1.2.1 +groq==0.37.0 +requests==2.32.5 \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index d254bc5..4b68d02 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,15 +15,15 @@ packages = [ ] [tool.poetry.dependencies] -python = "^3.10" -groq = "0.22.0" -pdfplumber = "0.11.5" -python-dotenv = "1.1.0" -requests = "2.32.2" +python = "^3.12" +groq = "0.37.0" +pdfplumber = "0.11.8" +python-dotenv = "1.2.1" +requests = "2.32.5" [tool.poetry.group.dev.dependencies] -ruff = "0.4.1" -pre-commit = "3.7.0" +ruff = "0.14.8" +pre-commit = "4.5.0" [build-system] requires = ["poetry-core"] @@ -32,7 +32,7 @@ build-backend = "poetry.core.masonry.api" [tool.ruff] line-length = 80 indent-width = 4 -target-version = "py310" +target-version = "py312" extend-exclude = [ "__init__.py", "migrations", diff --git a/tests/test_against_old.py b/tests/test_against_old.py index bab1b34..1b54025 100644 --- a/tests/test_against_old.py +++ b/tests/test_against_old.py @@ -1,5 +1,6 @@ import json import os +import pathlib import pytest @@ -10,7 +11,7 @@ @pytest.mark.parametrize("pdf_file", [ - os.path.join(PDF_DIR, f) for f in os.listdir(PDF_DIR) if f.endswith(".pdf") + str(p) for p in pathlib.Path(PDF_DIR).glob("*.pdf") ]) def test_pdf_parser_regression(pdf_file, data_regression): """Test against saved working version of output.""" diff --git a/tests/test_breaking_changes.py b/tests/test_breaking_changes.py index 525a7dd..4d372f0 100644 --- a/tests/test_breaking_changes.py +++ b/tests/test_breaking_changes.py @@ -1,9 +1,11 @@ import filecmp +from typing import List +from pathlib import Path from tests.test_parsing import process_single_file -def compare_folders(output, expected): +def compare_folders(output: Path, expected: Path) -> bool: """Compare the contents of two output dirs - verify older passed tests.""" comparison = filecmp.dircmp(output, expected) if comparison.left_only or comparison.right_only or comparison.diff_files: @@ -11,10 +13,11 @@ def compare_folders(output, expected): return True -def test_single_file_processing(filepath, output_dir, expected_output_dir): +def test_single_file_processing(filepath: Path, output_dir: Path, expected_output_dir: Path) -> None: """Test processing a single file and compare the output with the expected output.""" try: - process_single_file(filepath, output_dir) + # Assuming process_single_file expects string paths for external compatibility + process_single_file(str(filepath), str(output_dir)) assert compare_folders( output_dir, expected_output_dir ), "Output does not match expected" @@ -24,7 +27,7 @@ def test_single_file_processing(filepath, output_dir, expected_output_dir): print("Test passed") -def test_multiple_files_processing(input_files, output_dir, expected_output_dirs): +def test_multiple_files_processing(input_files: List[Path], output_dir: Path, expected_output_dirs: List[Path]) -> None: """Test processing multiple files and compare the outputs with the expected outputs.""" for input_file, expected_output_dir in zip(input_files, expected_output_dirs): test_single_file_processing(input_file, output_dir, expected_output_dir) @@ -33,7 +36,7 @@ def test_multiple_files_processing(input_files, output_dir, expected_output_dirs # TODO: upgrade to pytest-regressions if __name__ == "__main__": test_single_file_processing( - filepath="test_documents/ambuja_cement.pdf", - output_dir="output/ambuja_cement", - expected_output_dir="tests/parsed_correct/ambuja_cement", + filepath=Path("test_documents/ambuja_cement.pdf"), + output_dir=Path("output/ambuja_cement"), + expected_output_dir=Path("tests/parsed_correct/ambuja_cement"), ) diff --git a/tests/test_only_management_case.py b/tests/test_only_management_case.py index 8257a42..d47d959 100644 --- a/tests/test_only_management_case.py +++ b/tests/test_only_management_case.py @@ -11,7 +11,7 @@ def test_handle_only_management_case(filepath: str) -> None: Args: filepath: Path to the PDF file containing the transcript. """ - transcript: dict[int, str] = {} + transcript: dict[str, str] = {} with pdfplumber.open(filepath) as pdf: for page in pdf.pages: diff --git a/tests/test_parsing.py b/tests/test_parsing.py index 2247142..d1a6690 100644 --- a/tests/test_parsing.py +++ b/tests/test_parsing.py @@ -23,7 +23,7 @@ class TestChoices(Enum): def process_single_file(filepath: str, output_path: str): """Run a single file and save its output and log.""" - logger.debug("Starting testing for %s", filepath) + logger.debug(f"Starting testing for {filepath}") transcript = get_document_transcript(filepath) save_transcript(transcript, output_path, "raw_transcript") @@ -43,38 +43,54 @@ def process_batch(test_dir_path: str, test_all: bool = False): test_all (bool): Flag to toggle testing all documents or only those that failed last test. """ + error_files = set() if os.path.exists(FAILED_FILES_LOG): - with open(FAILED_FILES_LOG) as file: - error_files = file.readlines() - # TODO: make standard testing methods - # if os.path.exists(SUCCESS_FILES_LOG): - # with open(SUCCESS_FILES_LOG) as file: - # success_files = file.readlines() - - failed = open(FAILED_FILES_LOG, "w") - successful = open(SUCCESS_FILES_LOG, "w") + # Use 'with' statement and specify encoding for robustness + with open(FAILED_FILES_LOG, "r", encoding="utf-8") as file: + error_files = {line.strip() for line in file if line.strip()} + files_to_process = set() if not test_all: - files_to_test = error_files + files_to_process = error_files else: - files_to_test = set(os.listdir(test_dir_path)) - # also include option to not test successful - - for path in files_to_test: - try: + # Use os.scandir for efficiency and to filter out directories + files_to_process = { + entry.name + for entry in os.scandir(test_dir_path) + if entry.is_file() + } + # TODO: make standard testing methods + # The original code had a commented section for skipping successful files. + # If that functionality is desired, it would be implemented here. + # if os.path.exists(SUCCESS_FILES_LOG): + # with open(SUCCESS_FILES_LOG, "r", encoding="utf-8") as file: + # successful_files = {line.strip() for line in file if line.strip()} + # files_to_process -= successful_files + + # Sort files for consistent processing order, useful for debugging and reproducibility + files_to_process_sorted = sorted(list(files_to_process)) + + # Use 'with' statements for log files to ensure they are properly closed + with open(FAILED_FILES_LOG, "w", encoding="utf-8") as failed_log, \ + open(SUCCESS_FILES_LOG, "w", encoding="utf-8") as successful_log: + + for path in files_to_process_sorted: filepath = os.path.join(test_dir_path, path) - logger.info("Testing %s\n", path) - process_single_file(filepath, path) - successful.write(path + "\n") - except Exception: - failed.write(path + "\n") - logger.exception( - "Error while processing file %s", path - ) - continue - - failed.close() - successful.close() + + # Double-check if the path points to an actual file + if not os.path.isfile(filepath): + logger.warning(f"Skipping non-file entry: {filepath}") + failed_log.write(path + "\n") # Log non-files as failed to prevent re-attempting + continue + + try: + logger.info(f"Testing {path}") + process_single_file(filepath, path) + successful_log.write(path + "\n") + except Exception: # Catching bare Exception is generally discouraged but kept for original intent + failed_log.write(path + "\n") + logger.exception(f"Error while processing file {path}") + continue if __name__ == "__main__":