Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 12 additions & 12 deletions .github/scripts/update_version.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import re
import sys
from pathlib import Path


def bump_version(version: str, bump_type: str) -> str:
Expand All @@ -25,21 +26,20 @@ def bump_version(version: str, bump_type: str) -> str:
def main():
"""Main function for incrementing the version."""
bump_type = sys.argv[1] if len(sys.argv) > 1 else "patch"
path = "pyproject.toml"
toml_path = Path("pyproject.toml")

try:
with open(path, 'r') as f:
content = f.read()
content = toml_path.read_text()
except FileNotFoundError:
print(f"::error:: File not found: {path}")
print(f"::error:: File not found: {toml_path}")
sys.exit(1)
except IOError as e:
print(f"::error:: Error reading file {path}: {e}")
print(f"::error:: Error reading file {toml_path}: {e}")
sys.exit(1)

match = re.search(r'version\s*=\s*"(\d+)\.(\d+)\.(\d+)"', content)
if not match:
print(f"::error:: Version not found in {path}")
print(f"::error:: Version not found in {toml_path}")
sys.exit(1)

current_version = f"{match.group(1)}.{match.group(2)}.{match.group(3)}"
Expand All @@ -54,16 +54,16 @@ def main():
)

try:
with open(path, "w") as f:
f.write(new_content)
toml_path.write_text(new_content)
except IOError as e:
print(f"::error:: Error writing to file {path}: {e}")
print(f"::error:: Error writing to file {toml_path}: {e}")
sys.exit(1)

github_output_path = os.environ.get("GITHUB_OUTPUT")
if github_output_path:
github_output_path_str = os.environ.get("GITHUB_OUTPUT")
if github_output_path_str:
github_output_path = Path(github_output_path_str)
try:
with open(github_output_path, "a") as gh_out:
with github_output_path.open("a") as gh_out:
gh_out.write(f"new_version={new_version}\n")
except IOError as e:
print(f"::warning:: Error writing to GITHUB_OUTPUT ({github_output_path}): {e}")
Expand Down
3 changes: 3 additions & 0 deletions concall_parser/agents/check_moderator.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,10 +81,13 @@ def process(page_text: str, groq_model: str) -> str:
{"role": "system", "content": CONTEXT},
{"role": "user", "content": page_text},
]
response = '{"moderator": ""}' # Initialize with a default value to prevent UnboundLocalError
try:
response = get_groq_response(messages=messages, model=groq_model)
except Exception:
logger.exception(
"Could not get groq response for management extraction"
)
# If an exception occurs, the initialized default response will be returned.
# This ensures the function always returns a string as per its type hint.
return response
2 changes: 1 addition & 1 deletion concall_parser/agents/classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ class ClassifyModeratorIntent:
"""Classify moderator statements into categories."""

@staticmethod
def process(dialogue: str, groq_model: str):
def process(dialogue: str, groq_model: str) -> str:
"""Classify a moderator statement into one of the three categories.

Args:
Expand Down
38 changes: 23 additions & 15 deletions concall_parser/agents/extraction.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from concall_parser.log_config import logger
from concall_parser.utils.get_groq_responses import get_groq_response
import json

# TODO: add second prompt case, for apollo (may be solved using regex but idk)

Expand Down Expand Up @@ -87,7 +88,7 @@ class ExtractManagement:
"""Class to extract management information from a PDF document."""

@staticmethod
def process(page_text: str, groq_model: str) -> str:
def process(page_text: str, groq_model: str) -> dict:
"""Process the given page text to extract relevant management information.

Args:
Expand All @@ -96,31 +97,38 @@ def process(page_text: str, groq_model: str) -> str:
groq_model (str): The model to use for Groq queries.

Returns:
None
dict: A dictionary containing extracted management information,
or an empty dictionary if no information is found or an error occurs.
"""
# TODO: context selection logic is wrong, recheck.
# The current logic switches context if page_text is empty, which is likely not
# the intended behavior for SPEAKER_SELECTION_CONTEXT. An empty page_text
# should probably result in an empty response or an error.
if page_text: # Pythonic way to check for non-empty string
messages = [
{"role": "system", "content": CONTEXT},
{"role": "user", "content": page_text},
]
else:
# This branch is reached if page_text is empty.
# Using SPEAKER_SELECTION_CONTEXT with an empty user message is likely incorrect.
# Consider returning an empty dict or raising an error here.
if not page_text: # More explicit check for empty string
logger.warning("Received empty page_text for extraction. Returning empty response.")
return "{}" # Returning an empty JSON string as per "If no management information is found, return an empty dict: {}."
return {} # Return empty dict directly

# The current implementation always uses CONTEXT. The TODOs indicate a missing
# context selection logic for SPEAKER_SELECTION_CONTEXT.
messages = [
{"role": "system", "content": CONTEXT},
{"role": "user", "content": page_text},
]

# TODO: update data model of response in case of speaker selection
# TODO: add company name fix in case of speaker selection
try:
response = get_groq_response(messages=messages, model=groq_model)
return response
response_str = get_groq_response(messages=messages, model=groq_model)
# Attempt to parse the response as JSON for robustness and consistency
parsed_response = json.loads(response_str)
return parsed_response
except json.JSONDecodeError:
logger.exception(
"Groq response for management extraction was not valid JSON."
)
return {}
except Exception:
logger.exception(
"Could not get groq response for management extraction"
)
return "{}" # Ensure a consistent return type even on error
return {} # Ensure a consistent return type even on error
2 changes: 1 addition & 1 deletion concall_parser/base_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ class BaseExtractor(ABC):
@abstractmethod
def extract(self, *args, **kwargs):
"""Extracts data from the input."""
pass
pass
87 changes: 41 additions & 46 deletions concall_parser/extractors/dialogue_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,19 @@ def _handle_leftover_text(
cleaned = clean_text(leftover_text)

if current_analyst:
self.dialogues["analyst_discussion"][current_analyst]["dialogue"][
-1
]["dialogue"] += f" {cleaned}"
analyst_dialogues = self.dialogues["analyst_discussion"][current_analyst][
"dialogue"
]
if analyst_dialogues:
analyst_dialogues[-1]["dialogue"] += f" {cleaned}"
else:
# If this is the first dialogue for the analyst, treat leftover as their initial statement.
analyst_dialogues.append(
{
"speaker": current_analyst, # Assuming the analyst is speaking
"dialogue": cleaned,
}
)
elif self.dialogues["commentary_and_future_outlook"]:
self.dialogues["commentary_and_future_outlook"][-1]["dialogue"] += (
f" {cleaned}"
Expand Down Expand Up @@ -76,29 +86,28 @@ def _append_dialogue(
}
)

def _process_match(
self, match, groq_model: str, current_analyst: str | None
):
speaker = match.group("speaker").strip()
dialogue = match.group("dialogue")
intent = None

if speaker == "Moderator":
response = json.loads(
ClassifyModeratorIntent.process(
dialogue=dialogue, groq_model=groq_model
)
def _process_moderator_dialogue(
self, dialogue: str, groq_model: str
) -> tuple[str, str | None]:
"""Processes moderator dialogue to classify intent and extract analyst info.
Updates self.dialogues directly for new analyst discussions.
"""
response = json.loads(
ClassifyModeratorIntent.process(
dialogue=dialogue, groq_model=groq_model
)
intent = response["intent"]
if intent == "new_analyst_start":
current_analyst = response["analyst_name"]
self.dialogues["analyst_discussion"][current_analyst] = {
"analyst_company": response["analyst_company"],
"dialogue": [],
}
return intent, current_analyst, None # Moderator handled
)
intent = response["intent"]
current_analyst = None

return intent, current_analyst, (speaker, dialogue)
if intent == "new_analyst_start":
current_analyst = response["analyst_name"]
analyst_company = response["analyst_company"]
self.dialogues["analyst_discussion"][current_analyst] = {
"analyst_company": analyst_company,
"dialogue": [],
}
return intent, current_analyst

def extract_commentary_and_future_outlook(
self, transcript: dict[int, str], groq_model: str
Expand Down Expand Up @@ -126,23 +135,20 @@ def extract_commentary_and_future_outlook(
for match in self.speaker_pattern.finditer(text):
speaker = match.group("speaker").strip()
last_speaker = speaker
dialogue_content = match.group("dialogue")

if speaker == "Moderator":
response = json.loads(
ClassifyModeratorIntent.process(
dialogue=match.group("dialogue"),
groq_model=groq_model,
)
intent, current_analyst = self._process_moderator_dialogue(
dialogue_content, groq_model
)
intent = response["intent"]
if intent == "new_analyst_start":
return self.dialogues["commentary_and_future_outlook"]
continue

if intent == "opening":
self._append_dialogue(
speaker,
match.group("dialogue"),
dialogue_content,
intent,
current_analyst,
)
Expand Down Expand Up @@ -178,30 +184,19 @@ def extract_dialogues(
for match in self.speaker_pattern.finditer(text):
speaker = match.group("speaker").strip()
last_speaker = speaker
dialogue_content = match.group("dialogue")

if speaker == "Moderator":
response = json.loads(
ClassifyModeratorIntent.process(
dialogue=match.group("dialogue"),
groq_model=groq_model,
)
intent, current_analyst = self._process_moderator_dialogue(
dialogue_content, groq_model
)
intent = response["intent"]
if intent == "new_analyst_start":
current_analyst = response["analyst_name"]
self.dialogues["analyst_discussion"][
current_analyst
] = {
"analyst_company": response["analyst_company"],
"dialogue": [],
}
continue

if intent is None:
break

self._append_dialogue(
speaker, match.group("dialogue"), intent, current_analyst
speaker, dialogue_content, intent, current_analyst
)

return self.dialogues
24 changes: 19 additions & 5 deletions concall_parser/extractors/management_case_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,24 @@

class ManagementCaseExtractor:
"""Handles case where moderator is not present."""

# Pre-compile the regex for performance and readability
SPEAKER_SPEECH_PATTERN = re.compile(
r"""
([A-Z]\.\s)? # Optional initial (e.g., "J. "). Group 1.
([A-Za-z\s]+) # Speaker name (e.g., "John Doe"). Group 2.
:\s # Colon and space separator.
(.*?) # Non-greedy match for the speech content. Group 3.
(?= # Positive lookahead for either:
\s[A-Z]\.?\s? # Another speaker pattern (space, initial, optional dot, optional space,
[A-Za-z\s]+:\s # name, colon, space).
| # OR
$ # End of the string.
)
""",
re.DOTALL | re.VERBOSE,
)

def extract(self, transcript: dict[str, str]):
"""Extracts speaker names and their corresponding speeches from the transcript.

Expand All @@ -21,11 +39,7 @@ def extract(self, transcript: dict[str, str]):
speech_pair: dict[str, list[str]] = {}

for _, text in transcript.items():
matches = re.findall(
r"([A-Z]\.\s)?([A-Za-z\s]+):\s(.*?)(?=\s[A-Z]\.?\s?[A-Za-z\s]+:\s|$)",
text,
re.DOTALL,
)
matches = self.SPEAKER_SPEECH_PATTERN.findall(text)

for initial, name, speech in matches:
speaker = (
Expand Down
2 changes: 1 addition & 1 deletion concall_parser/log_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,4 @@ def configure_logger(
if save_to_file:
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)
logger.addHandler(file_handler)
9 changes: 3 additions & 6 deletions concall_parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,17 +70,14 @@ def _get_document_transcript(self, filepath: str, link: str) -> dict[int, str]:
)

if link:
self.transcript = get_transcript_from_link(link=link)
transcript = get_transcript_from_link(link=link)
else:
self.transcript = get_document_transcript(filepath=filepath)
return self.transcript
transcript = get_document_transcript(filepath=filepath)
return transcript

def extract_concall_info(self) -> dict:
"""Extracts company name and management team from the transcript.

Args:
None

Returns:
dict: Company name and management team as a dictionary.
"""
Expand Down
8 changes: 5 additions & 3 deletions concall_parser/utils/file_utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import json
import os
import tempfile
from pathlib import Path

Expand Down Expand Up @@ -123,7 +122,10 @@ def get_transcript_from_link(link:str) -> dict[int, str]:
logger.debug("Request to get transcript from link: %s", link)

headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" # noqa: E501
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)
}
# Use a higher timeout for potentially large PDF downloads
response = requests.get(url=link, headers=headers, timeout=60, stream=True)
Expand Down Expand Up @@ -152,7 +154,7 @@ def get_transcript_from_link(link:str) -> dict[int, str]:
# Ensure the temporary file is cleaned up, even if errors occur
if temp_doc_path and temp_doc_path.exists(): # Check if path was assigned and exists
try:
os.remove(temp_doc_path)
temp_doc_path.unlink()
logger.debug("Cleaned up temporary file: %s", temp_doc_path)
except OSError as e:
logger.warning("Could not remove temporary file %s: %s", temp_doc_path, e)
Loading
Loading