JS12540 · JS12540 · Dec 6, 2025 · Dec 6, 2025
diff --git a/.github/scripts/update_version.py b/.github/scripts/update_version.py
@@ -27,13 +27,20 @@ def main():
     bump_type = sys.argv[1] if len(sys.argv) > 1 else "patch"
     path = "pyproject.toml"
 
-    with open(path) as f:
-        content = f.read()
+    try:
+        with open(path, 'r') as f:
+            content = f.read()
+    except FileNotFoundError:
+        print(f"::error:: File not found: {path}")
+        sys.exit(1)
+    except IOError as e:
+        print(f"::error:: Error reading file {path}: {e}")
+        sys.exit(1)
 
     match = re.search(r'version\s*=\s*"(\d+)\.(\d+)\.(\d+)"', content)
     if not match:
-        print("Version not found in pyproject.toml")
-        return
+        print(f"::error:: Version not found in {path}")
+        sys.exit(1)
 
     current_version = f"{match.group(1)}.{match.group(2)}.{match.group(3)}"
     new_version = bump_version(current_version, bump_type)
@@ -46,11 +53,22 @@ def main():
         content,
     )
 
-    with open(path, "w") as f:
-        f.write(new_content)
+    try:
+        with open(path, "w") as f:
+            f.write(new_content)
+    except IOError as e:
+        print(f"::error:: Error writing to file {path}: {e}")
+        sys.exit(1)
 
-    with open(os.environ["GITHUB_OUTPUT"], "a") as gh_out:
-        gh_out.write(f"new_version={new_version}\n")
+    github_output_path = os.environ.get("GITHUB_OUTPUT")
+    if github_output_path:
+        try:
+            with open(github_output_path, "a") as gh_out:
+                gh_out.write(f"new_version={new_version}\n")
+        except IOError as e:
+            print(f"::warning:: Error writing to GITHUB_OUTPUT ({github_output_path}): {e}")
+    else:
+        print("::warning:: GITHUB_OUTPUT environment variable not set. Cannot output new_version.")
 
 
 if __name__ == "__main__":

diff --git a/concall_parser/agents/classify.py b/concall_parser/agents/classify.py
@@ -13,16 +13,16 @@
 
 Response should be in json format for opening and end, like this:
 {
-    "intent": "opening"
-    "reasoning": Provide a reasoning for the intent
+    "intent": "opening",
+    "reasoning": "Provide a reasoning for the intent"
 }
 
 If it's new_analyst_start, response should be in json format like this:
 {
     "intent": "new_analyst_start",
     "analyst_name":"analyst_name present in the moderator statement",
-    "analyst_company:""analyst_company present in the moderator statement"
-    "reasoning": Provide a reasoning for the intent
+    "analyst_company": "analyst_company present in the moderator statement",
+    "reasoning": "Provide a reasoning for the intent"
 }
 
 EXAMPLES:
@@ -32,7 +32,7 @@
 
 Response:
 {
-    "intent": "opening"
+    "intent": "opening",
     "reasoning": "From the moderator statement, it's the start of the call, as the moderator is welcoming everyone to the concall."
 }
 
@@ -42,17 +42,17 @@
 
 Response:
 {
-    "intent": "new_analyst_start"
-    "analyst_name": "Mukesh Saraf"
-    "analyst_company": "Avendus Spark"
+    "intent": "new_analyst_start",
+    "analyst_name": "Mukesh Saraf",
+    "analyst_company": "Avendus Spark",
     "reasoning": "From the moderator statement, it's introducing an analyst from a new company to start the Q&A session."
 }
 
 Moderator statement: "Shall we go for the closing, sir?"
 
 Response:
 {
-    "intent": "end"
+    "intent": "end",
     "reasoning": "From the moderator statement, it's closing the call."
 }
 """  # noqa

diff --git a/concall_parser/agents/extraction.py b/concall_parser/agents/extraction.py
@@ -64,7 +64,7 @@
 Kunal Dhamesha
 Disclaimer
 Currently, 34 wells have been put on stream
-\u2013 Managing Director and Chief Executive Officer, Siemens Limited - Thank you very much and all the best and a very happy year ahead.
+– Managing Director and Chief Executive Officer, Siemens Limited - Thank you very much and all the best and a very happy year ahead.
 
 
 Output:
@@ -98,17 +98,21 @@ def process(page_text: str, groq_model: str) -> str:
         Returns:
             None
         """
-        # TODO: context selection logic is wrong, recheck
-        if page_text != "":
+        # TODO: context selection logic is wrong, recheck.
+        # Previously an empty page_text switched the system prompt to
+        # SPEAKER_SELECTION_CONTEXT, which was likely unintended. An empty page_text
+        # now short-circuits with an empty JSON object instead of sending a request.
+        if page_text:  # Pythonic way to check for non-empty string
             messages = [
                 {"role": "system", "content": CONTEXT},
                 {"role": "user", "content": page_text},
             ]
         else:
-            messages = [
-                {"role": "system", "content": SPEAKER_SELECTION_CONTEXT},
-                {"role": "user", "content": page_text},
-            ]
+            # This branch is reached if page_text is empty.
+            # Sending SPEAKER_SELECTION_CONTEXT with an empty user message was likely incorrect,
+            # so skip the request and return an empty JSON object instead.
+            logger.warning("Received empty page_text for extraction. Returning empty response.")
+            return "{}"  # Returning an empty JSON string as per "If no management information is found, return an empty dict: {}."
 
         # TODO: update data model of response in case of speaker selection
         # TODO: add company name fix in case of speaker selection
@@ -119,3 +123,4 @@ def process(page_text: str, groq_model: str) -> str:
             logger.exception(
                 "Could not get groq response for management extraction"
             )
+            return "{}"  # Ensure a consistent return type even on error
diff --git a/concall_parser/agents/verify_speakers.py b/concall_parser/agents/verify_speakers.py
@@ -80,7 +80,7 @@ class VerifySpeakerNames:
     """Finds actual names from extracted speaker pattern."""
 
     @staticmethod
-    def process(speakers: str, groq_model: str):
+    def process(speakers: str, groq_model: str) -> str:
         """Returns the actual names out of all the speaker pattern matches provided.
 
         Args:

diff --git a/concall_parser/extractors/management.py b/concall_parser/extractors/management.py
@@ -15,6 +15,9 @@ def extract(self, text: str, groq_model: str) -> dict:
                 page_text=text, groq_model=groq_model
             )
             return json.loads(response)
+        except json.JSONDecodeError:
+            logger.exception("Failed to decode JSON response from management extraction.")
+            return {}
         except Exception:
-            logger.exception("Failed to extract management team.")
+            logger.exception("An unexpected error occurred during management extraction.")
             return {}
diff --git a/concall_parser/extractors/management_case_extractor.py b/concall_parser/extractors/management_case_extractor.py
@@ -5,7 +5,7 @@
 
 class ManagementCaseExtractor:
     """Handles case where moderator is not present."""
-    def extract(self, transcript:dict[int,str]):
+    def extract(self, transcript: dict[str, str]):
         """Extracts speaker names and their corresponding speeches from the transcript.
 
         To be used when moderator is not present in transcript.
@@ -29,8 +29,8 @@ def extract(self, transcript:dict[int,str]):
 
             for initial, name, speech in matches:
                 speaker = (
-                    f"{(initial or '').strip()} {name.strip()}".strip()
-                )  # Clean speaker name
+                    f"{(initial or '').strip()} {name.strip()}"
+                ).strip()  # Clean speaker name
                 speech = re.sub(r"\n", " ", speech).strip()  # Clean speech text
 
                 if speaker not in all_speakers:
@@ -40,4 +40,4 @@ def extract(self, transcript:dict[int,str]):
                 speech_pair[speaker].append(speech)
 
         logger.debug(f"Extracted Speakers: {all_speakers}")
-        return speech_pair
+        return speech_pair
diff --git a/concall_parser/parser.py b/concall_parser/parser.py
@@ -62,10 +62,10 @@ def _get_document_transcript(self, filepath: str, link: str) -> dict[int, str]:
             transcript: Dictionary of page number, page text pair.
 
         Raises:
-            Exception in case neither of filepath or link are provided.
+            ValueError: In case neither of filepath or link are provided.
         """
         if not (filepath or link):
-            raise Exception(
+            raise ValueError(
                 "Concall source cannot be empty. Provide filepath or link to concall."
             )
 

diff --git a/concall_parser/utils/file_utils.py b/concall_parser/utils/file_utils.py
@@ -1,5 +1,7 @@
 import json
 import os
+import tempfile
+from pathlib import Path
 
 import pdfplumber
 import requests
@@ -19,7 +21,7 @@ def get_document_transcript(filepath: str) -> dict[int, str]:
     transcript = {}
     try:
         with pdfplumber.open(filepath) as pdf:
-            logger.debug("Loaded document")
+            logger.debug("Loaded document: %s", filepath)
             page_number = 1
             for page in pdf.pages:
                 text = page.extract_text()
@@ -28,9 +30,14 @@ def get_document_transcript(filepath: str) -> dict[int, str]:
                     page_number += 1
         return transcript
     except FileNotFoundError:
-        raise FileNotFoundError("Please check if file exists.")
-    except Exception:
-        logger.exception("Could not load file %s", filepath)
+        logger.error("File not found: %s", filepath)
+        raise FileNotFoundError(f"Please check if file exists: {filepath}")
+    except (pdfplumber.PDFSyntaxError, pdfplumber.PDFDataError) as e:
+        logger.exception("Error parsing PDF file %s: %s", filepath, e)
+        raise ValueError(f"Error parsing PDF file: {filepath}") from e
+    except Exception as e:
+        logger.exception("Could not load file %s: %s", filepath, e)
+        raise # Re-raise the exception after logging
 
 
 def save_output(
@@ -46,15 +53,22 @@ def save_output(
         output_base_path (str): Path to directory in which outputs are to be saved.
         document_name (str): Name of the file being parsed, corresponds to company name for now.
     """
+    # Use pathlib for robust path handling
+    output_base_path_obj = Path(output_base_path)
+    document_stem = Path(document_name).stem # Get filename without extension
+
+    output_dir_path_obj = output_base_path_obj / document_stem
+    output_dir_path_obj.mkdir(parents=True, exist_ok=True)
+
     for dialogue_type, dialogue in dialogues.items():
-        output_dir_path = os.path.join(
-            output_base_path, os.path.basename(document_name)[:-4]
-        )
-        os.makedirs(output_dir_path, exist_ok=True)
-        with open(
-            os.path.join(output_dir_path, f"{dialogue_type}.json"), "w"
-        ) as file:
-            json.dump(dialogue, file, indent=4)
+        output_file_path = output_dir_path_obj / f"{dialogue_type}.json"
+        try:
+            with open(output_file_path, "w", encoding="utf-8") as file:
+                json.dump(dialogue, file, indent=4)
+            logger.debug("Saved %s to %s", dialogue_type, output_file_path)
+        except OSError as e:
+            logger.exception("Could not save dialogue type %s to %s: %s", dialogue_type, output_file_path, e)
+            raise # Re-raise after logging
 
 
 def save_transcript(
@@ -72,16 +86,23 @@ def save_transcript(
         output_base_path (str): Path of directory where transcripts are to be saved.
     """
     try:
-        document_name = os.path.basename(document_path)[:-4]  # remove the .pdf
-        output_dir_path = os.path.join(output_base_path, document_name)
-        os.makedirs(output_base_path, exist_ok=True)
-        with open(f"{output_dir_path}.txt", "w") as file:
+        output_base_path_obj = Path(output_base_path)
+        output_base_path_obj.mkdir(parents=True, exist_ok=True)
+
+        document_name_stem = Path(document_path).stem  # Get filename without extension
+        output_file_path = output_base_path_obj / f"{document_name_stem}.txt"
+
+        with open(output_file_path, "w", encoding="utf-8") as file:
             for _, text in transcript.items():
                 file.write(text)
                 file.write("\n\n")
-        logger.info("Saved transcript text to file\n")
-    except Exception:
-        logger.exception("Could not save document transcript")
+        logger.info("Saved transcript text to file: %s", output_file_path)
+    except OSError as e:
+        logger.exception("Could not save document transcript to %s: %s", output_file_path, e)
+        raise # Re-raise after logging
+    except Exception as e: # Catch any other unexpected errors
+        logger.exception("An unexpected error occurred while saving transcript: %s", e)
+        raise
 
 
 def get_transcript_from_link(link:str) -> dict[int, str]:
@@ -96,23 +117,42 @@ def get_transcript_from_link(link:str) -> dict[int, str]:
     Raises:
         Http error, if encountered during downloading document.
     """
+    transcript = dict()
+    temp_doc_path = None # Initialize to None for finally block
     try:
-        logger.debug("Request to get transcript from link.")
+        logger.debug("Request to get transcript from link: %s", link)
 
         headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"# noqa: E501
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" # noqa: E501
         }
-        response = requests.get(url=link, headers=headers, timeout=30, stream=True)
+        # Use a higher timeout for potentially large PDF downloads
+        response = requests.get(url=link, headers=headers, timeout=60, stream=True)
         response.raise_for_status()
 
-        temp_doc_path = "temp_document.pdf"
-        with open(temp_doc_path, 'wb') as temp_pdf:
+        # Use tempfile for a unique, securely created temp file; delete=False, so it is removed in the finally block
+        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
+            temp_doc_path = Path(temp_pdf.name)
             for chunk in response.iter_content(chunk_size=8192):
                 temp_pdf.write(chunk)
-        transcript = get_document_transcript(filepath=temp_doc_path)
-        os.remove(temp_doc_path)
+            logger.debug("Downloaded PDF to temporary file: %s", temp_doc_path)
 
+        transcript = get_document_transcript(filepath=str(temp_doc_path))
         return transcript
-    except Exception:
-        logger.exception("Could not get transcript from link")
-        return dict()
+    except requests.exceptions.RequestException as e:
+        logger.exception("HTTP/Network error while getting transcript from link %s: %s", link, e)
+        # Optionally re-raise a more specific custom exception if needed by calling code
+        raise ConnectionError(f"Failed to download PDF from {link}") from e
+    except (OSError, ValueError) as e: # Catch errors from file operations or PDF parsing
+        logger.exception("File/PDF processing error after downloading from link %s: %s", link, e)
+        raise
+    except Exception as e:
+        logger.exception("An unexpected error occurred while getting transcript from link %s: %s", link, e)
+        raise
+    finally:
+        # Ensure the temporary file is cleaned up, even if errors occur
+        if temp_doc_path and temp_doc_path.exists(): # Check if path was assigned and exists
+            try:
+                os.remove(temp_doc_path)
+                logger.debug("Cleaned up temporary file: %s", temp_doc_path)
+            except OSError as e:
+                logger.warning("Could not remove temporary file %s: %s", temp_doc_path, e)
diff --git a/concall_parser/utils/get_groq_responses.py b/concall_parser/utils/get_groq_responses.py
@@ -1,3 +1,5 @@
+from typing import List, Dict, Any
+
 from groq import APIStatusError, Groq
 
 from concall_parser.config import get_groq_api_key
@@ -6,7 +8,7 @@
 client = Groq(api_key=get_groq_api_key())
 
 
-def get_groq_response(messages, model):
+def get_groq_response(messages: List[Dict[str, str]], model: str) -> str | None:
     """Get response from Groq API."""
     try:
         response = client.chat.completions.create(

diff --git a/dev-requirements.txt b/dev-requirements.txt
@@ -1,7 +1,7 @@
-pre-commit==3.7.0
-pytest==8.3.5
-pytest-regressions==2.7.0
-ruff==0.4.1
-python-dotenv==1.1.0
-groq==0.22.0
-requests==2.32.2
+pre-commit==4.5.0
+pytest==9.0.1
+pytest-regressions==2.8.3
+ruff==0.14.8
+python-dotenv==1.2.1
+groq==0.37.0
+requests==2.32.5