From 6c988926ab5f2f2f5c39d3554f7b020941d5b216 Mon Sep 17 00:00:00 2001
From: pranshu-raj-211
Date: Thu, 17 Jul 2025 16:17:44 +0530
Subject: [PATCH 1/5] update pyproject.toml to follow the PEP 621 standard

The project is now compatible with both uv and pip. Run "uv sync" to get
started with uv, or "uv sync --extra dev" to install the dev dependencies
as well.
---
 pyproject.toml | 48 +++++++++++++++++++++---------------------------
 1 file changed, 21 insertions(+), 27 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index d254bc5..fb51cee 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,49 +1,43 @@
-[tool.poetry]
+[project]
 name = "concall-parser"
 version = "1.0.4"
 description = "A parser for extracting analyst discussion and management commentary efficiently from concalls."
 authors = [
-    "Jay Shah ",
-    "Pranshu Raj "
-]
-maintainers = [
-    "Pranshu Raj "
+    { name = "Jay Shah", email = "jayshah0726@gmail.com" },
+    { name = "Pranshu Raj", email = "pranshuraj65536@gmail.com" },
 ]
+maintainers = [{ name = "Pranshu Raj", email = "pranshuraj65536@gmail.com" }]
 readme = "README.md"
-packages = [
-    { include = "concall_parser", from = "." },
+requires-python = ">=3.10"
+dependencies = [
+    "groq==0.22.0",
+    "pdfplumber==0.11.5",
+    "python-dotenv==1.1.0",
+    "requests==2.32.2",
 ]

-[tool.poetry.dependencies]
-python = "^3.10"
-groq = "0.22.0"
-pdfplumber = "0.11.5"
-python-dotenv = "1.1.0"
-requests = "2.32.2"
-
-[tool.poetry.group.dev.dependencies]
-ruff = "0.4.1"
-pre-commit = "3.7.0"
+[project.optional-dependencies]
+dev = ["ruff==0.4.1", "pre-commit==3.7.0"]

 [build-system]
-requires = ["poetry-core"]
-build-backend = "poetry.core.masonry.api"
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["concall_parser"]

 [tool.ruff]
 line-length = 80
 indent-width = 4
 target-version = "py310"
-extend-exclude = [
-    "__init__.py",
-    "migrations",
-]
+extend-exclude = ["__init__.py", "migrations"]

 [tool.ruff.lint]
 extend-select = [
     "UP", # pyupgrade
-    "E", # pycodestyle
-    "I", # isort
-    "D", # pydocstyle
+    "E",  # pycodestyle
+    "I",  # isort
+    "D",  # pydocstyle
 ]
 ignore = [
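For reference, the install flows under the new PEP 621 metadata look like
this (a sketch; it assumes a recent uv, and pip >= 21.3 so that editable
installs work from pyproject.toml alone):

    # with uv
    uv sync              # runtime dependencies only
    uv sync --extra dev  # also installs ruff and pre-commit

    # with pip
    pip install -e .
    pip install -e ".[dev]"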
"httpx>=0.28.1", "pdfplumber==0.11.5", "python-dotenv==1.1.0", - "requests==2.32.2", ] [project.optional-dependencies] From 5c026766da2759e23cb3bb7eb15dd6ec8c45e463 Mon Sep 17 00:00:00 2001 From: pranshu-raj-211 Date: Thu, 17 Jul 2025 23:02:23 +0530 Subject: [PATCH 3/5] add todos, logging in tests --- concall_parser/utils/file_utils.py | 22 ++++++++++++---------- tests/test_breaking_changes.py | 6 +++--- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/concall_parser/utils/file_utils.py b/concall_parser/utils/file_utils.py index e034ee7..4c6288c 100644 --- a/concall_parser/utils/file_utils.py +++ b/concall_parser/utils/file_utils.py @@ -7,6 +7,8 @@ from concall_parser.log_config import logger +# TODO: use aiofiles for file operations +# TODO: check out async pdf readers, if not available, use threadpool def get_document_transcript(filepath: str) -> dict[int, str]: """Extracts text of a pdf document. @@ -18,6 +20,7 @@ def get_document_transcript(filepath: str) -> dict[int, str]: """ transcript = {} try: + # TODO: Run this part with async, or in another thread with pdfplumber.open(filepath) as pdf: logger.debug("Loaded document") page_number = 1 @@ -46,14 +49,13 @@ def save_output( output_base_path (str): Path to directory in which outputs are to be saved. document_name (str): Name of the file being parsed, corresponds to company name for now. """ + # TODO: Add error handling for dialogue_type, dialogue in dialogues.items(): output_dir_path = os.path.join( output_base_path, os.path.basename(document_name)[:-4] ) os.makedirs(output_dir_path, exist_ok=True) - with open( - os.path.join(output_dir_path, f"{dialogue_type}.json"), "w" - ) as file: + with open(os.path.join(output_dir_path, f"{dialogue_type}.json"), "w") as file: json.dump(dialogue, file, indent=4) @@ -84,15 +86,15 @@ def save_transcript( logger.exception("Could not save document transcript") -def get_transcript_from_link(link:str) -> dict[int, str]: +def get_transcript_from_link(link: str) -> dict[int, str]: """Extracts transcript by downloading pdf from a given link. - + Args: link: Link to the pdf document of earnings call report. - + Returns: transcript: A page number-page text mapping. - + Raises: Http error, if encountered during downloading document. 
""" @@ -100,14 +102,14 @@ def get_transcript_from_link(link:str) -> dict[int, str]: logger.debug("Request to get transcript from link.") headers = { - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"# noqa: E501 + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" # noqa: E501 } async with httpx.AsyncClient(headers=headers) as client: response = await client.get(url=link, timeout=30) response.raise_for_status() temp_doc_path = "temp_document.pdf" - with open(temp_doc_path, 'wb') as temp_pdf: + with open(temp_doc_path, "wb") as temp_pdf: for chunk in response.iter_content(chunk_size=8192): temp_pdf.write(chunk) transcript = get_document_transcript(filepath=temp_doc_path) @@ -116,4 +118,4 @@ def get_transcript_from_link(link:str) -> dict[int, str]: return transcript except Exception: logger.exception("Could not get transcript from link") - return dict() \ No newline at end of file + return dict() diff --git a/tests/test_breaking_changes.py b/tests/test_breaking_changes.py index 525a7dd..4ddb1b5 100644 --- a/tests/test_breaking_changes.py +++ b/tests/test_breaking_changes.py @@ -1,5 +1,6 @@ import filecmp +from concall_parser.log_config import logger from tests.test_parsing import process_single_file @@ -19,9 +20,9 @@ def test_single_file_processing(filepath, output_dir, expected_output_dir): output_dir, expected_output_dir ), "Output does not match expected" except AssertionError: - print("Broken -- fix it") + logger.exception(f"Failed to process file {filepath}") return - print("Test passed") + logger.info(f"Parsing successful for {filepath}") def test_multiple_files_processing(input_files, output_dir, expected_output_dirs): @@ -30,7 +31,6 @@ def test_multiple_files_processing(input_files, output_dir, expected_output_dirs test_single_file_processing(input_file, output_dir, expected_output_dir) -# TODO: upgrade to pytest-regressions if __name__ == "__main__": test_single_file_processing( filepath="test_documents/ambuja_cement.pdf", From 5e795d7d211181adb23eac2301a45e30dc8e9101 Mon Sep 17 00:00:00 2001 From: pranshu-raj-211 Date: Thu, 17 Jul 2025 23:50:04 +0530 Subject: [PATCH 4/5] started making file utils async made network calls async - httpx, made file operations async - aiofiles, run cpu intensive operations in separate thread, added aiofiles dependency, --- concall_parser/utils/file_utils.py | 89 +++++++++++++++++++----------- pyproject.toml | 1 + 2 files changed, 57 insertions(+), 33 deletions(-) diff --git a/concall_parser/utils/file_utils.py b/concall_parser/utils/file_utils.py index 4c6288c..84f52a8 100644 --- a/concall_parser/utils/file_utils.py +++ b/concall_parser/utils/file_utils.py @@ -1,6 +1,9 @@ +import asyncio import json import os +import tempfile +import aiofiles import httpx import pdfplumber @@ -9,7 +12,7 @@ # TODO: use aiofiles for file operations # TODO: check out async pdf readers, if not available, use threadpool -def get_document_transcript(filepath: str) -> dict[int, str]: +async def get_document_transcript(filepath: str) -> dict[int, str]: """Extracts text of a pdf document. Args: @@ -18,22 +21,30 @@ def get_document_transcript(filepath: str) -> dict[int, str]: Returns: transcript: Dictionary of page number, page text pair. 
""" - transcript = {} - try: - # TODO: Run this part with async, or in another thread - with pdfplumber.open(filepath) as pdf: - logger.debug("Loaded document") - page_number = 1 - for page in pdf.pages: - text = page.extract_text() - if text: - transcript[page_number] = text - page_number += 1 - return transcript - except FileNotFoundError: - raise FileNotFoundError("Please check if file exists.") - except Exception: - logger.exception("Could not load file %s", filepath) + + def _extract_pdf_text(filepath: str) -> dict[int, str]: + transcript = {} + try: + with pdfplumber.open(filepath) as pdf: + logger.debug(f"Loaded document {filepath}") + # ? Do we need to start a counter? can we not do pdfplumber pages or enumerate? + page_number = 1 + for page in pdf.pages: + text = page.extract_text() + if text: + transcript[page_number] = text + page_number += 1 + return transcript + except FileNotFoundError: + logger.exception( + f"Could not file with path {filepath}. Please check if it exists." + ) + raise FileNotFoundError("Please check if file exists.") + except Exception: + logger.exception("Could not load file %s", filepath) + + loop = asyncio.get_event_loop() + return await loop.run_in_executor(None, _extract_pdf_text, filepath) def save_output( @@ -49,17 +60,20 @@ def save_output( output_base_path (str): Path to directory in which outputs are to be saved. document_name (str): Name of the file being parsed, corresponds to company name for now. """ - # TODO: Add error handling - for dialogue_type, dialogue in dialogues.items(): + try: output_dir_path = os.path.join( output_base_path, os.path.basename(document_name)[:-4] ) os.makedirs(output_dir_path, exist_ok=True) - with open(os.path.join(output_dir_path, f"{dialogue_type}.json"), "w") as file: - json.dump(dialogue, file, indent=4) + for dialogue_type, dialogue in dialogues.items(): + output_file_path = os.path.join(output_dir_path, f"{dialogue_type}.json") + async with aiofiles.open(output_file_path, "w") as file: + await file.write(json.dump(dialogue, indent=4)) + except Exception: + logger.exception(f"Failed to save outputs for file {output_base_path}.") -def save_transcript( +async def save_transcript( transcript: dict, document_path: str, output_base_path: str = "raw_transcript", @@ -77,16 +91,18 @@ def save_transcript( document_name = os.path.basename(document_path)[:-4] # remove the .pdf output_dir_path = os.path.join(output_base_path, document_name) os.makedirs(output_base_path, exist_ok=True) - with open(f"{output_dir_path}.txt", "w") as file: + # ? concatenate all transcript texts before writing at once? IO overhead? + async with aiofiles.open(f"{output_dir_path}.txt", "w") as file: for _, text in transcript.items(): - file.write(text) - file.write("\n\n") + await file.write(text) + await file.write("\n\n") + # ? Do we gather all tasks before asynchronously executing? logger.info("Saved transcript text to file\n") except Exception: logger.exception("Could not save document transcript") -def get_transcript_from_link(link: str) -> dict[int, str]: +async def get_transcript_from_link(link: str) -> dict[int, str]: """Extracts transcript by downloading pdf from a given link. Args: @@ -99,23 +115,30 @@ def get_transcript_from_link(link: str) -> dict[int, str]: Http error, if encountered during downloading document. 
""" try: + # TODO: expand error handling - file operations logger.debug("Request to get transcript from link.") headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" # noqa: E501 } + with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as file: + temp_file_path = file.name + async with httpx.AsyncClient(headers=headers) as client: response = await client.get(url=link, timeout=30) - response.raise_for_status() + response.raise_for_status() - temp_doc_path = "temp_document.pdf" - with open(temp_doc_path, "wb") as temp_pdf: - for chunk in response.iter_content(chunk_size=8192): - temp_pdf.write(chunk) - transcript = get_document_transcript(filepath=temp_doc_path) - os.remove(temp_doc_path) + with aiofiles.open(temp_file_path, "wb") as file: + async for chunk in response.aiter_bytes(chunk_size=8192): + await file.write(chunk) + transcript = await get_document_transcript(filepath=temp_file_path) return transcript + except Exception: logger.exception("Could not get transcript from link") return dict() + + finally: + if os.path.exists(temp_file_path): + os.remove(temp_file_path) diff --git a/pyproject.toml b/pyproject.toml index 8d21764..cd739ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ maintainers = [{ name = "Pranshu Raj", email = "pranshuraj65536@gmail.com" }] readme = "README.md" requires-python = ">=3.10" dependencies = [ + "aiofiles>=24.1.0", "groq==0.22.0", "httpx>=0.28.1", "pdfplumber==0.11.5", From 2882196b75f824d6ecc9fd85f50bde9d677fa04e Mon Sep 17 00:00:00 2001 From: pranshu-raj-211 Date: Sun, 20 Jul 2025 19:51:39 +0530 Subject: [PATCH 5/5] fix file utils errors, add basic perf test for file utils next steps - improve perf script, use config files for testing config, improve structure of performance checks, include failure and success rates Signed-off-by: pranshu-raj-211 --- concall_parser/utils/file_utils.py | 9 +- tests/perf/time_logger.py | 157 +++++++++++++++++++++++++++++ 2 files changed, 161 insertions(+), 5 deletions(-) create mode 100644 tests/perf/time_logger.py diff --git a/concall_parser/utils/file_utils.py b/concall_parser/utils/file_utils.py index 84f52a8..dab3a89 100644 --- a/concall_parser/utils/file_utils.py +++ b/concall_parser/utils/file_utils.py @@ -10,8 +10,6 @@ from concall_parser.log_config import logger -# TODO: use aiofiles for file operations -# TODO: check out async pdf readers, if not available, use threadpool async def get_document_transcript(filepath: str) -> dict[int, str]: """Extracts text of a pdf document. @@ -47,7 +45,7 @@ def _extract_pdf_text(filepath: str) -> dict[int, str]: return await loop.run_in_executor(None, _extract_pdf_text, filepath) -def save_output( +async def save_output( dialogues: dict, document_name: str, output_base_path: str = "output" ) -> None: """Save dialogues to JSON files in the specified output path. 
@@ -124,11 +122,12 @@ async def get_transcript_from_link(link: str) -> dict[int, str]:
         with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as file:
             temp_file_path = file.name

-        async with httpx.AsyncClient(headers=headers) as client:
+        # TODO: add checks that the correct file type is being downloaded
+        async with httpx.AsyncClient(headers=headers, follow_redirects=True) as client:
             response = await client.get(url=link, timeout=30)
             response.raise_for_status()

-        with aiofiles.open(temp_file_path, "wb") as file:
+        async with aiofiles.open(temp_file_path, "wb") as file:
             async for chunk in response.aiter_bytes(chunk_size=8192):
                 await file.write(chunk)

diff --git a/tests/perf/time_logger.py b/tests/perf/time_logger.py
new file mode 100644
index 0000000..248e4df
--- /dev/null
+++ b/tests/perf/time_logger.py
@@ -0,0 +1,159 @@
+"""Basic timing benchmarks for the file utils in concall_parser."""
+
+import asyncio
+import os
+import time
+from dataclasses import dataclass
+
+from concall_parser.log_config import logger
+from concall_parser.utils import file_utils
+
+
+@dataclass
+class BenchmarkResult:
+    """Structured benchmark result."""
+
+    func_name: str
+    duration: float
+    mode: str  # 'sync' or 'async'
+    input_identifier: str  # identifier for the specific input used
+    arg_count: int
+    kwarg_count: int
+    success: bool
+    error_message: str | None = None
+    timestamp: float | None = None
+
+    def __post_init__(self):
+        """Fill in the timestamp when one is not supplied."""
+        if self.timestamp is None:
+            self.timestamp = time.time()
+
+
+class Benchmark:
+    """Benchmark function performance with respect to time for measuring improvements."""
+
+    def __init__(self):
+        self.results = []
+
+    def time_sync(self, func, *args, **kwargs):
+        """Time a synchronous function call."""
+        start_time = time.perf_counter()
+        exec_result = func(*args, **kwargs)
+        end_time = time.perf_counter()
+        duration = end_time - start_time
+        result = {
+            "func": func.__name__,
+            "duration": duration,
+            "mode": "sync",
+            "arg_count": len(args),
+            "kwarg_count": len(kwargs),
+        }
+        logger.info(f"Executed sync func {func.__name__}", extra=result)
+        self.results.append(result)
+        return exec_result
+
+    async def time_async(self, func, *args, **kwargs):
+        """Time an asynchronous function."""
+        start_time = time.perf_counter()
+        exec_result = await func(*args, **kwargs)
+        end_time = time.perf_counter()
+        duration = end_time - start_time
+        result = {
+            "func": func.__name__,
+            "duration": duration,
+            "mode": "async",
+            "arg_count": len(args),
+            "kwarg_count": len(kwargs),
+        }
+        logger.info(f"Executed async func {func.__name__}", extra=result)
+        self.results.append(result)
+        return exec_result
+
+    def print_results(self):
+        """Prints benchmark results."""
+        print("\n" + "=" * 60)
+        print("BENCHMARK RESULTS")
+        print("=" * 60)
+
+        for result in self.results:
+            print(f"Function: {result['func']}")
+            print(f"Duration: {result['duration']:.4f} seconds")
+            print(f"Args: {result['arg_count']}, Kwargs: {result['kwarg_count']}")
+            print("-" * 40)
+
+    def get_average_time(self, function_name: str) -> float:
+        """Get average time for a specific function."""
+        times = [r["duration"] for r in self.results if r["func"] == function_name]
+        return sum(times) / len(times) if times else 0
+
+    async def run_batch_benchmark(self, func, times: int = 5, *args, **kwargs):
+        """Run the benchmark multiple times to check average performance."""
+        durations = []
+        for _ in range(times):
+            start_time = time.perf_counter()
+            if asyncio.iscoroutinefunction(func):
+                await func(*args, **kwargs)
+            else:
+                func(*args, **kwargs)
+            end_time = time.perf_counter()
+            durations.append(end_time - start_time)
+        avg_duration = sum(durations) / len(durations) if durations else 0
+        logger.info(
+            f"Batch benchmark for {func.__name__}: avg {avg_duration:.4f}s over {times} runs"
+        )
+        return avg_duration
+
+
+async def run_file_utils_perf():
+    """Benchmarking for file util functions defined in concall_parser/utils/file_utils.py."""
+    benchmark = Benchmark()
+    test_documents_dir = "tests/test_documents"
+    test_links = [
+        "https://www.adanigas.com/-/media/Project/AdaniGas/Investors/Financials/Earnings-Call-Transcript-and--Recordings/AdaniTotalGas-Earnings-Q3FY25.pdf",
+        "https://www.indusind.com/content/dam/indusind-corporate/investors/QuarterFinancialResults/FY2024-2025/Quarter4/IndusInd-Bank-Analyst-Call-Q4FY25-20250521.pdf",
+        "https://www.bseindia.com/stockinfo/AnnPdfOpen.aspx?Pname=a809b7d3-ca44-4410-acf7-af064786fe5a.pdf",
+    ]
+    test_files = [
+        os.path.join(test_documents_dir, f)
+        for f in os.listdir(test_documents_dir)
+        if f.endswith(".pdf")
+    ]
+
+    transcripts = []
+
+    # Benchmark get_document_transcript (local paths)
+    for file_path in test_files:
+        try:
+            transcript = await benchmark.time_async(
+                file_utils.get_document_transcript, file_path
+            )
+            transcripts.append(transcript)
+        except Exception:
+            logger.exception(f"Error in get_document_transcript for {file_path}")
+
+    # Benchmark get_transcript_from_link
+    for link in test_links:
+        try:
+            transcript = await benchmark.time_async(
+                file_utils.get_transcript_from_link, link
+            )
+            transcripts.append(transcript)
+        except Exception:
+            logger.exception(f"Error in get_transcript_from_link for {link}")
+
+    # TODO: benchmarking for save_output
+
+    # Benchmark save_transcript (async)
+    for i, transcript in enumerate(transcripts):
+        try:
+            await benchmark.time_async(
+                file_utils.save_transcript, transcript, f"sample_{i}.pdf", "test_perf"
+            )
+        except Exception:
+            logger.exception(f"Error in save_transcript for transcript #{i}")
+
+    benchmark.print_results()
+
+
+if __name__ == "__main__":
+    asyncio.run(run_file_utils_perf())
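The harness can also be exercised on stand-in workloads, which is handy
for checking the timer itself. A sketch (it assumes tests/perf is
importable as a package; the fake_* functions are placeholders, not
repository code):

    import asyncio

    from tests.perf.time_logger import Benchmark

    async def fake_io() -> None:
        await asyncio.sleep(0.05)  # stands in for network/file latency

    def fake_cpu() -> int:
        return sum(i * i for i in range(100_000))  # stands in for parsing work

    async def demo() -> None:
        bench = Benchmark()
        bench.time_sync(fake_cpu)
        await bench.time_async(fake_io)
        avg = await bench.run_batch_benchmark(fake_io, 5)
        print(f"fake_io average over 5 runs: {avg:.4f}s")
        bench.print_results()

    asyncio.run(demo())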