Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 26 additions & 8 deletions .github/scripts/update_version.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,20 @@ def main():
bump_type = sys.argv[1] if len(sys.argv) > 1 else "patch"
path = "pyproject.toml"

with open(path) as f:
content = f.read()
try:
with open(path, 'r') as f:
content = f.read()
except FileNotFoundError:
print(f"::error:: File not found: {path}")
sys.exit(1)
except IOError as e:
print(f"::error:: Error reading file {path}: {e}")
sys.exit(1)

match = re.search(r'version\s*=\s*"(\d+)\.(\d+)\.(\d+)"', content)
if not match:
print("Version not found in pyproject.toml")
return
print(f"::error:: Version not found in {path}")
sys.exit(1)

current_version = f"{match.group(1)}.{match.group(2)}.{match.group(3)}"
new_version = bump_version(current_version, bump_type)
Expand All @@ -46,11 +53,22 @@ def main():
content,
)

with open(path, "w") as f:
f.write(new_content)
try:
with open(path, "w") as f:
f.write(new_content)
except IOError as e:
print(f"::error:: Error writing to file {path}: {e}")
sys.exit(1)

with open(os.environ["GITHUB_OUTPUT"], "a") as gh_out:
gh_out.write(f"new_version={new_version}\n")
github_output_path = os.environ.get("GITHUB_OUTPUT")
if github_output_path:
try:
with open(github_output_path, "a") as gh_out:
gh_out.write(f"new_version={new_version}\n")
except IOError as e:
print(f"::warning:: Error writing to GITHUB_OUTPUT ({github_output_path}): {e}")
else:
print("::warning:: GITHUB_OUTPUT environment variable not set. Cannot output new_version.")


if __name__ == "__main__":
Expand Down
18 changes: 9 additions & 9 deletions concall_parser/agents/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,16 +13,16 @@

Response should be in json format for opening and end, like this:
{
"intent": "opening"
"reasoning": Provide a reasoning for the intent
"intent": "opening",
"reasoning": "Provide a reasoning for the intent"
}

If it's new_analyst_start, response should be in json format like this:
{
"intent": "new_analyst_start",
"analyst_name":"analyst_name present in the moderator statement",
"analyst_company:""analyst_company present in the moderator statement"
"reasoning": Provide a reasoning for the intent
"analyst_company": "analyst_company present in the moderator statement",
"reasoning": "Provide a reasoning for the intent"
}

EXAMPLES:
Expand All @@ -32,7 +32,7 @@

Response:
{
"intent": "opening"
"intent": "opening",
"reasoning": "From the moderator statement, it's the start of the call, as the moderator is welcoming everyone to the concall."
}

Expand All @@ -42,17 +42,17 @@

Response:
{
"intent": "new_analyst_start"
"analyst_name": "Mukesh Saraf"
"analyst_company": "Avendus Spark"
"intent": "new_analyst_start",
"analyst_name": "Mukesh Saraf",
"analyst_company": "Avendus Spark",
"reasoning": "From the moderator statement, it's introducing an analyst from a new company to start the Q&A session."
}

Moderator statement: "Shall we go for the closing, sir?"

Response:
{
"intent": "end"
"intent": "end",
"reasoning": "From the moderator statement, it's closing the call."
}
""" # noqa
Expand Down
19 changes: 12 additions & 7 deletions concall_parser/agents/extraction.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@
Kunal Dhamesha
Disclaimer
Currently, 34 wells have been put on stream
\u2013 Managing Director and Chief Executive Officer, Siemens Limited - Thank you very much and all the best and a very happy year ahead.
Managing Director and Chief Executive Officer, Siemens Limited - Thank you very much and all the best and a very happy year ahead.


Output:
Expand Down Expand Up @@ -98,17 +98,21 @@ def process(page_text: str, groq_model: str) -> str:
Returns:
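str: Response string, expected to be JSON; "{}" is returned when page_text is empty or the Groq request fails.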
"""
# TODO: context selection logic is wrong, recheck
if page_text != "":
# TODO: context selection logic is wrong, recheck.
# Previously an empty page_text switched the system prompt to
# SPEAKER_SELECTION_CONTEXT while still sending an empty user message.
# An empty page_text now short-circuits with an empty JSON response instead.
if page_text: # Pythonic way to check for non-empty string
messages = [
{"role": "system", "content": CONTEXT},
{"role": "user", "content": page_text},
]
else:
messages = [
{"role": "system", "content": SPEAKER_SELECTION_CONTEXT},
{"role": "user", "content": page_text},
]
# This branch is reached only if page_text is empty.
# Sending SPEAKER_SELECTION_CONTEXT with an empty user message is not useful,
# so skip the model call and return an empty JSON object.
logger.warning("Received empty page_text for extraction. Returning empty response.")
return "{}" # Returning an empty JSON string as per "If no management information is found, return an empty dict: {}."

# TODO: update data model of response in case of speaker selection
# TODO: add company name fix in case of speaker selection
Expand All @@ -119,3 +123,4 @@ def process(page_text: str, groq_model: str) -> str:
logger.exception(
"Could not get groq response for management extraction"
)
return "{}" # Ensure a consistent return type even on error
2 changes: 1 addition & 1 deletion concall_parser/agents/verify_speakers.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ class VerifySpeakerNames:
"""Finds actual names from extracted speaker pattern."""

@staticmethod
def process(speakers: str, groq_model: str):
def process(speakers: str, groq_model: str) -> str:
"""Returns the actual names out of all the speaker pattern matches provided.

Args:
Expand Down
5 changes: 4 additions & 1 deletion concall_parser/extractors/management.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ def extract(self, text: str, groq_model: str) -> dict:
page_text=text, groq_model=groq_model
)
return json.loads(response)
except json.JSONDecodeError:
logger.exception("Failed to decode JSON response from management extraction.")
return {}
except Exception:
logger.exception("Failed to extract management team.")
logger.exception("An unexpected error occurred during management extraction.")
return {}
8 changes: 4 additions & 4 deletions concall_parser/extractors/management_case_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

class ManagementCaseExtractor:
"""Handles case where moderator is not present."""
def extract(self, transcript:dict[int,str]):
def extract(self, transcript: dict[str, str]):
"""Extracts speaker names and their corresponding speeches from the transcript.

To be used when moderator is not present in transcript.
Expand All @@ -29,8 +29,8 @@ def extract(self, transcript:dict[int,str]):

for initial, name, speech in matches:
speaker = (
f"{(initial or '').strip()} {name.strip()}".strip()
) # Clean speaker name
f"{(initial or '').strip()} {name.strip()}"
).strip() # Clean speaker name
speech = re.sub(r"\n", " ", speech).strip() # Clean speech text

if speaker not in all_speakers:
Expand All @@ -40,4 +40,4 @@ def extract(self, transcript:dict[int,str]):
speech_pair[speaker].append(speech)

logger.debug(f"Extracted Speakers: {all_speakers}")
return speech_pair
return speech_pair
4 changes: 2 additions & 2 deletions concall_parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,10 @@ def _get_document_transcript(self, filepath: str, link: str) -> dict[int, str]:
transcript: Dictionary of page number, page text pair.

Raises:
Exception in case neither of filepath or link are provided.
ValueError: In case neither of filepath or link are provided.
"""
if not (filepath or link):
raise Exception(
raise ValueError(
"Concall source cannot be empty. Provide filepath or link to concall."
)

Expand Down
98 changes: 69 additions & 29 deletions concall_parser/utils/file_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import json
import os
import tempfile
from pathlib import Path

import pdfplumber
import requests
Expand All @@ -19,7 +21,7 @@ def get_document_transcript(filepath: str) -> dict[int, str]:
transcript = {}
try:
with pdfplumber.open(filepath) as pdf:
logger.debug("Loaded document")
logger.debug("Loaded document: %s", filepath)
page_number = 1
for page in pdf.pages:
text = page.extract_text()
Expand All @@ -28,9 +30,14 @@ def get_document_transcript(filepath: str) -> dict[int, str]:
page_number += 1
return transcript
except FileNotFoundError:
raise FileNotFoundError("Please check if file exists.")
except Exception:
logger.exception("Could not load file %s", filepath)
logger.error("File not found: %s", filepath)
raise FileNotFoundError(f"Please check if file exists: {filepath}")
except (pdfplumber.PDFSyntaxError, pdfplumber.PDFDataError) as e:
logger.exception("Error parsing PDF file %s: %s", filepath, e)
raise ValueError(f"Error parsing PDF file: {filepath}") from e
except Exception as e:
logger.exception("Could not load file %s: %s", filepath, e)
raise # Re-raise the exception after logging


def save_output(
Expand All @@ -46,15 +53,22 @@ def save_output(
output_base_path (str): Path to directory in which outputs are to be saved.
document_name (str): Name of the file being parsed, corresponds to company name for now.
"""
# Use pathlib for robust path handling
output_base_path_obj = Path(output_base_path)
document_stem = Path(document_name).stem # Get filename without extension

output_dir_path_obj = output_base_path_obj / document_stem
output_dir_path_obj.mkdir(parents=True, exist_ok=True)

for dialogue_type, dialogue in dialogues.items():
output_dir_path = os.path.join(
output_base_path, os.path.basename(document_name)[:-4]
)
os.makedirs(output_dir_path, exist_ok=True)
with open(
os.path.join(output_dir_path, f"{dialogue_type}.json"), "w"
) as file:
json.dump(dialogue, file, indent=4)
output_file_path = output_dir_path_obj / f"{dialogue_type}.json"
try:
with open(output_file_path, "w", encoding="utf-8") as file:
json.dump(dialogue, file, indent=4)
logger.debug("Saved %s to %s", dialogue_type, output_file_path)
except OSError as e:
logger.exception("Could not save dialogue type %s to %s: %s", dialogue_type, output_file_path, e)
raise # Re-raise after logging


def save_transcript(
Expand All @@ -72,16 +86,23 @@ def save_transcript(
output_base_path (str): Path of directory where transcripts are to be saved.
"""
try:
document_name = os.path.basename(document_path)[:-4] # remove the .pdf
output_dir_path = os.path.join(output_base_path, document_name)
os.makedirs(output_base_path, exist_ok=True)
with open(f"{output_dir_path}.txt", "w") as file:
output_base_path_obj = Path(output_base_path)
output_base_path_obj.mkdir(parents=True, exist_ok=True)

document_name_stem = Path(document_path).stem # Get filename without extension
output_file_path = output_base_path_obj / f"{document_name_stem}.txt"

with open(output_file_path, "w", encoding="utf-8") as file:
for _, text in transcript.items():
file.write(text)
file.write("\n\n")
logger.info("Saved transcript text to file\n")
except Exception:
logger.exception("Could not save document transcript")
logger.info("Saved transcript text to file: %s", output_file_path)
except OSError as e:
logger.exception("Could not save document transcript to %s: %s", output_file_path, e)
raise # Re-raise after logging
except Exception as e: # Catch any other unexpected errors
logger.exception("An unexpected error occurred while saving transcript: %s", e)
raise


def get_transcript_from_link(link:str) -> dict[int, str]:
Expand All @@ -96,23 +117,42 @@ def get_transcript_from_link(link:str) -> dict[int, str]:
Raises:
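ConnectionError: If downloading the document fails (wraps the underlying requests exception).
OSError, ValueError: If writing the temporary file or parsing the downloaded PDF fails.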
"""
transcript = dict()
temp_doc_path = None # Initialize to None for finally block
try:
logger.debug("Request to get transcript from link.")
logger.debug("Request to get transcript from link: %s", link)

headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"# noqa: E501
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" # noqa: E501
}
response = requests.get(url=link, headers=headers, timeout=30, stream=True)
# Use a higher timeout for potentially large PDF downloads
response = requests.get(url=link, headers=headers, timeout=60, stream=True)
response.raise_for_status()

temp_doc_path = "temp_document.pdf"
with open(temp_doc_path, 'wb') as temp_pdf:
# Use tempfile for a uniquely named, securely created temporary file.
# delete=False keeps it on disk after the handle closes; it is removed in the finally block.
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_pdf:
temp_doc_path = Path(temp_pdf.name)
for chunk in response.iter_content(chunk_size=8192):
temp_pdf.write(chunk)
transcript = get_document_transcript(filepath=temp_doc_path)
os.remove(temp_doc_path)
logger.debug("Downloaded PDF to temporary file: %s", temp_doc_path)

transcript = get_document_transcript(filepath=str(temp_doc_path))
return transcript
except Exception:
logger.exception("Could not get transcript from link")
return dict()
except requests.exceptions.RequestException as e:
logger.exception("HTTP/Network error while getting transcript from link %s: %s", link, e)
# Optionally re-raise a more specific custom exception if needed by calling code
raise ConnectionError(f"Failed to download PDF from {link}") from e
except (OSError, ValueError) as e: # Catch errors from file operations or PDF parsing
logger.exception("File/PDF processing error after downloading from link %s: %s", link, e)
raise
except Exception as e:
logger.exception("An unexpected error occurred while getting transcript from link %s: %s", link, e)
raise
finally:
# Ensure the temporary file is cleaned up, even if errors occur
if temp_doc_path and temp_doc_path.exists(): # Check if path was assigned and exists
try:
os.remove(temp_doc_path)
logger.debug("Cleaned up temporary file: %s", temp_doc_path)
except OSError as e:
logger.warning("Could not remove temporary file %s: %s", temp_doc_path, e)
4 changes: 3 additions & 1 deletion concall_parser/utils/get_groq_responses.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import List, Dict, Any

from groq import APIStatusError, Groq

from concall_parser.config import get_groq_api_key
Expand All @@ -6,7 +8,7 @@
client = Groq(api_key=get_groq_api_key())


def get_groq_response(messages, model):
def get_groq_response(messages: List[Dict[str, str]], model: str) -> str | None:
"""Get response from Groq API."""
try:
response = client.chat.completions.create(
Expand Down
14 changes: 7 additions & 7 deletions dev-requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
pre-commit==3.7.0
pytest==8.3.5
pytest-regressions==2.7.0
ruff==0.4.1
python-dotenv==1.1.0
groq==0.22.0
requests==2.32.2
pre-commit==4.5.0
pytest==9.0.1
pytest-regressions==2.8.3
ruff==0.14.8
python-dotenv==1.2.1
groq==0.37.0
requests==2.32.5
Loading
Loading