109 changes: 67 additions & 42 deletions concall_parser/utils/file_utils.py
@@ -1,13 +1,16 @@
import asyncio
import json
import os
import tempfile

import aiofiles
import httpx
import pdfplumber
import requests

from concall_parser.log_config import logger


def get_document_transcript(filepath: str) -> dict[int, str]:
async def get_document_transcript(filepath: str) -> dict[int, str]:
"""Extracts text of a pdf document.

Args:
@@ -16,24 +19,33 @@ def get_document_transcript(filepath: str) -> dict[int, str]:
Returns:
transcript: Dictionary mapping page number to page text.
"""
transcript = {}
try:
with pdfplumber.open(filepath) as pdf:
logger.debug("Loaded document")
page_number = 1
for page in pdf.pages:
text = page.extract_text()
if text:
transcript[page_number] = text
page_number += 1
return transcript
except FileNotFoundError:
raise FileNotFoundError("Please check if file exists.")
except Exception:
logger.exception("Could not load file %s", filepath)


def save_output(
def _extract_pdf_text(filepath: str) -> dict[int, str]:
"""Extract text from each page of a PDF (blocking; run via an executor)."""
transcript = {}
try:
with pdfplumber.open(filepath) as pdf:
logger.debug(f"Loaded document {filepath}")
# enumerate replaces the manual page counter
for page_number, page in enumerate(pdf.pages, start=1):
text = page.extract_text()
if text:
transcript[page_number] = text
return transcript
except FileNotFoundError:
logger.exception(
f"Could not find file with path {filepath}. Please check if it exists."
)
raise FileNotFoundError("Please check if file exists.")
except Exception:
logger.exception("Could not load file %s", filepath)
return {}

loop = asyncio.get_running_loop()
return await loop.run_in_executor(None, _extract_pdf_text, filepath)
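
Note: since the project now requires Python >= 3.10, asyncio.to_thread (added in 3.9) could express the same hand-off with less boilerplate. A minimal sketch, assuming the _extract_pdf_text helper defined above; semantics are equivalent to the run_in_executor call:

    import asyncio

    async def get_document_transcript(filepath: str) -> dict[int, str]:
        """Extract a PDF transcript without blocking the event loop."""
        # Submits the blocking pdfplumber work to the default thread pool,
        # equivalent to loop.run_in_executor(None, _extract_pdf_text, filepath).
        return await asyncio.to_thread(_extract_pdf_text, filepath)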


async def save_output(
dialogues: dict, document_name: str, output_base_path: str = "output"
) -> None:
"""Save dialogues to JSON files in the specified output path.
@@ -46,18 +58,20 @@ def save_output(
dialogues (dict): Mapping of dialogue type to extracted dialogue content.
document_name (str): Name of the file being parsed, corresponds to company name for now.
output_base_path (str): Path to directory in which outputs are to be saved.
"""
for dialogue_type, dialogue in dialogues.items():
try:
output_dir_path = os.path.join(
output_base_path, os.path.basename(document_name)[:-4]
)
os.makedirs(output_dir_path, exist_ok=True)
with open(
os.path.join(output_dir_path, f"{dialogue_type}.json"), "w"
) as file:
json.dump(dialogue, file, indent=4)
for dialogue_type, dialogue in dialogues.items():
output_file_path = os.path.join(output_dir_path, f"{dialogue_type}.json")
async with aiofiles.open(output_file_path, "w") as file:
await file.write(json.dumps(dialogue, indent=4))
except Exception:
logger.exception(f"Failed to save outputs for file {output_base_path}.")


def save_transcript(
async def save_transcript(
transcript: dict,
document_path: str,
output_base_path: str = "raw_transcript",
@@ -75,44 +89,55 @@ def save_transcript(
document_name = os.path.basename(document_path)[:-4] # remove the .pdf
output_dir_path = os.path.join(output_base_path, document_name)
os.makedirs(output_base_path, exist_ok=True)
with open(f"{output_dir_path}.txt", "w") as file:
# ? Concatenate all page texts and write once to reduce IO overhead? (see sketch below)
async with aiofiles.open(f"{output_dir_path}.txt", "w") as file:
for _, text in transcript.items():
file.write(text)
file.write("\n\n")
await file.write(text)
await file.write("\n\n")
# ? Should write tasks be gathered and run concurrently? (see the gather sketch above)
logger.info("Saved transcript text to file")
except Exception:
logger.exception("Could not save document transcript")


def get_transcript_from_link(link:str) -> dict[int, str]:
async def get_transcript_from_link(link: str) -> dict[int, str]:
"""Extracts transcript by downloading pdf from a given link.

Args:
link: Link to the pdf document of earnings call report.

Returns:
transcript: A page number-page text mapping. Empty if the download
or extraction fails (errors are logged rather than raised).
"""
temp_file_path = None
try:
# TODO: expand error handling - file operations
logger.debug("Request to get transcript from link.")

headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"# noqa: E501
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" # noqa: E501
}
response = requests.get(url=link, headers=headers, timeout=30, stream=True)
response.raise_for_status()
with tempfile.NamedTemporaryFile(suffix='.pdf', delete=False) as file:
temp_file_path = file.name

# TODO: add checks to verify the correct file was downloaded
async with httpx.AsyncClient(headers=headers, follow_redirects=True) as client:
response = await client.get(url=link, timeout=30)
response.raise_for_status()

temp_doc_path = "temp_document.pdf"
with open(temp_doc_path, 'wb') as temp_pdf:
for chunk in response.iter_content(chunk_size=8192):
temp_pdf.write(chunk)
transcript = get_document_transcript(filepath=temp_doc_path)
os.remove(temp_doc_path)
async with aiofiles.open(temp_file_path, "wb") as file:
async for chunk in response.aiter_bytes(chunk_size=8192):
await file.write(chunk)

transcript = await get_document_transcript(filepath=temp_file_path)
return transcript

except Exception:
logger.exception("Could not get transcript from link")
return dict()
return dict()

finally:
if temp_file_path and os.path.exists(temp_file_path):
os.remove(temp_file_path)
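
For reference, callers of the converted functions now need a running event loop. A minimal usage sketch; the URL is a placeholder:

    import asyncio

    async def main() -> None:
        transcript = await get_transcript_from_link(
            "https://example.com/earnings-call.pdf"
        )
        print(f"Extracted {len(transcript)} pages")

    if __name__ == "__main__":
        asyncio.run(main())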
49 changes: 22 additions & 27 deletions pyproject.toml
@@ -1,49 +1,44 @@
[tool.poetry]
[project]
name = "concall-parser"
version = "1.0.4"
description = "A parser for extracting analyst discussion and management commentary efficiently from concalls."
authors = [
"Jay Shah <jayshah0726@gmail.com>",
"Pranshu Raj <pranshuraj65536@gmail.com>"
]
maintainers = [
"Pranshu Raj <pranshuraj65536@gmail.com>"
{ name = "Jay Shah", email = "jayshah0726@gmail.com" },
{ name = "Pranshu Raj", email = "pranshuraj65536@gmail.com" },
]
maintainers = [{ name = "Pranshu Raj", email = "pranshuraj65536@gmail.com" }]
readme = "README.md"
packages = [
{ include = "concall_parser", from = "." },
requires-python = ">=3.10"
dependencies = [
"aiofiles>=24.1.0",
"groq==0.22.0",
"httpx>=0.28.1",
"pdfplumber==0.11.5",
"python-dotenv==1.1.0",
]

[tool.poetry.dependencies]
python = "^3.10"
groq = "0.22.0"
pdfplumber = "0.11.5"
python-dotenv = "1.1.0"
requests = "2.32.2"

[tool.poetry.group.dev.dependencies]
ruff = "0.4.1"
pre-commit = "3.7.0"
[project.optional-dependencies]
dev = ["ruff==0.4.1", "pre-commit==3.7.0"]

[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"
requires = ["hatchling"]
build-backend = "hatchling.build"

[tool.hatch.build.targets.wheel]
packages = ["concall_parser"]

[tool.ruff]
line-length = 80
indent-width = 4
target-version = "py310"
extend-exclude = [
"__init__.py",
"migrations",
]
extend-exclude = ["__init__.py", "migrations"]

[tool.ruff.lint]
extend-select = [
"UP", # pyupgrade
"E", # pycodestyle
"I", # isort
"D", # pydocstyle
"E", # pycodestyle
"I", # isort
"D", # pydocstyle
]

ignore = [