109 changes: 67 additions & 42 deletions concall_parser/utils/file_utils.py
@@ -1,13 +1,16 @@
import asyncio
import json
import os
import tempfile

import aiofiles
import httpx
import pdfplumber
import requests

from concall_parser.log_config import logger


def get_document_transcript(filepath: str) -> dict[int, str]:
async def get_document_transcript(filepath: str) -> dict[int, str]:
"""Extracts text of a pdf document.

Args:
@@ -16,24 +19,33 @@ def get_document_transcript(filepath: str) -> dict[int, str]:
Returns:
transcript: Dictionary mapping page number to page text.
"""
transcript = {}
try:
with pdfplumber.open(filepath) as pdf:
logger.debug("Loaded document")
page_number = 1
for page in pdf.pages:
text = page.extract_text()
if text:
transcript[page_number] = text
page_number += 1
return transcript
except FileNotFoundError:
raise FileNotFoundError("Please check if file exists.")
except Exception:
logger.exception("Could not load file %s", filepath)


def save_output(
def _extract_pdf_text(filepath: str) -> dict[int, str]:
"""Extract text from each page of a PDF (blocking; run via an executor)."""
transcript = {}
try:
with pdfplumber.open(filepath) as pdf:
logger.debug(f"Loaded document {filepath}")
# enumerate replaces the manual page counter
for page_number, page in enumerate(pdf.pages, start=1):
text = page.extract_text()
if text:
transcript[page_number] = text
return transcript
except FileNotFoundError:
logger.exception(
f"Could not find file with path {filepath}. Please check if it exists."
)
raise FileNotFoundError("Please check if file exists.")
except Exception:
logger.exception("Could not load file %s", filepath)
return {}

loop = asyncio.get_running_loop()
return await loop.run_in_executor(None, _extract_pdf_text, filepath)
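
Note: since the project now requires Python >= 3.10, asyncio.to_thread (added in 3.9) could express the same hand-off with less boilerplate. A minimal sketch, assuming the _extract_pdf_text helper defined above; semantics are equivalent to the run_in_executor call:

    import asyncio

    async def get_document_transcript(filepath: str) -> dict[int, str]:
        """Extract a PDF transcript without blocking the event loop."""
        # Submits the blocking pdfplumber work to the default thread pool,
        # equivalent to loop.run_in_executor(None, _extract_pdf_text, filepath).
        return await asyncio.to_thread(_extract_pdf_text, filepath)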


async def save_output(
dialogues: dict, document_name: str, output_base_path: str = "output"
) -> None:
"""Save dialogues to JSON files in the specified output path.
@@ -46,18 +58,20 @@ def save_output(
dialogues (dict): Mapping of dialogue type to extracted dialogue content.
document_name (str): Name of the file being parsed, corresponds to company name for now.
output_base_path (str): Path to directory in which outputs are to be saved.
"""
for dialogue_type, dialogue in dialogues.items():
try:
output_dir_path = os.path.join(
output_base_path, os.path.basename(document_name)[:-4]
)
os.makedirs(output_dir_path, exist_ok=True)
with open(
os.path.join(output_dir_path, f"{dialogue_type}.json"), "w"
) as file:
json.dump(dialogue, file, indent=4)
for dialogue_type, dialogue in dialogues.items():
output_file_path = os.path.join(output_dir_path, f"{dialogue_type}.json")
async with aiofiles.open(output_file_path, "w") as file:
await file.write(json.dumps(dialogue, indent=4))
except Exception:
logger.exception(f"Failed to save outputs for file {output_base_path}.")


def save_transcript(
async def save_transcript(
transcript: dict,
document_path: str,
output_base_path: str = "raw_transcript",
@@ -75,44 +89,55 @@ def save_transcript(
document_name = os.path.basename(document_path)[:-4] # remove the .pdf
output_dir_path = os.path.join(output_base_path, document_name)
os.makedirs(output_base_path, exist_ok=True)
with open(f"{output_dir_path}.txt", "w") as file:
# ? Concatenate all page texts and write once to reduce IO overhead? (see sketch below)
async with aiofiles.open(f"{output_dir_path}.txt", "w") as file:
for _, text in transcript.items():
file.write(text)
file.write("\n\n")
await file.write(text)
await file.write("\n\n")
# ? Should write tasks be gathered and run concurrently? (see the gather sketch above)
logger.info("Saved transcript text to file")
except Exception:
logger.exception("Could not save document transcript")


def get_transcript_from_link(link:str) -> dict[int, str]:
async def get_transcript_from_link(link: str) -> dict[int, str]:
"""Extracts transcript by downloading pdf from a given link.

Args:
link: Link to the pdf document of earnings call report.

Returns:
transcript: A page number-page text mapping. Empty if the download
or extraction fails (errors are logged rather than raised).
"""
temp_file_path = None
try:
# TODO: expand error handling - file operations
logger.debug("Request to get transcript from link.")

headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"# noqa: E501
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" # noqa: E501
}
response = requests.get(url=link, headers=headers, timeout=30, stream=True)
response.raise_for_status()
with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as file:
temp_file_path = file.name

# TODO: add checks to verify the correct file was downloaded
async with httpx.AsyncClient(headers=headers, follow_redirects=True) as client:
response = await client.get(url=link, timeout=30)
response.raise_for_status()

temp_doc_path = "temp_document.pdf"
with open(temp_doc_path, 'wb') as temp_pdf:
for chunk in response.iter_content(chunk_size=8192):
temp_pdf.write(chunk)
transcript = get_document_transcript(filepath=temp_doc_path)
os.remove(temp_doc_path)
async with aiofiles.open(temp_file_path, "wb") as file:
async for chunk in response.aiter_bytes(chunk_size=8192):
await file.write(chunk)

transcript = await get_document_transcript(filepath=temp_file_path)
return transcript

except Exception:
logger.exception("Could not get transcript from link")
return dict()
return dict()

finally:
if temp_file_path and os.path.exists(temp_file_path):
os.remove(temp_file_path)
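
For reference, callers of the converted functions now need a running event loop. A minimal usage sketch; the URL is a placeholder:

    import asyncio

    async def main() -> None:
        transcript = await get_transcript_from_link(
            "https://example.com/earnings-call.pdf"
        )
        print(f"Extracted {len(transcript)} pages")

    if __name__ == "__main__":
        asyncio.run(main())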
49 changes: 22 additions & 27 deletions pyproject.toml
@@ -1,49 +1,44 @@
[tool.poetry]
[project]
name = "concall-parser"
version = "1.0.4"
description = "A parser for extracting analyst discussion and management commentary efficiently from concalls."
authors = [
"Jay Shah <jayshah0726@gmail.com>",
"Pranshu Raj <pranshuraj65536@gmail.com>"
]
maintainers = [
"Pranshu Raj <pranshuraj65536@gmail.com>"
{ name = "Jay Shah", email = "jayshah0726@gmail.com" },
{ name = "Pranshu Raj", email = "pranshuraj65536@gmail.com" },
]
maintainers = [{ name = "Pranshu Raj", email = "pranshuraj65536@gmail.com" }]
readme = "README.md"
packages = [
{ include = "concall_parser", from = "." },
requires-python = ">=3.10"
dependencies = [
"aiofiles>=24.1.0",
"groq==0.22.0",
"httpx>=0.28.1",
"pdfplumber==0.11.5",
"python-dotenv==1.1.0",
]

[tool.poetry.dependencies]
python = "^3.10"
groq = "0.22.0"
pdfplumber = "0.11.5"
python-dotenv = "1.1.0"
requests = "2.32.2"

[tool.poetry.group.dev.dependencies]
ruff = "0.4.1"
pre-commit = "3.7.0"
[project.optional-dependencies]
dev = ["ruff==0.4.1", "pre-commit==3.7.0"]

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["concall_parser"]

[tool.ruff]
line-length = 80
indent-width = 4
target-version = "py310"
extend-exclude = [
"__init__.py",
"migrations",
]
extend-exclude = ["__init__.py", "migrations"]

[tool.ruff.lint]
extend-select = [
"UP", # pyupgrade
"E", # pycodestyle
"I", # isort
"D", # pydocstyle
"E", # pycodestyle
"I", # isort
"D", # pydocstyle
]

ignore = [