Add support for csv and txt files #5

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open

dcjohnson24 wants to merge 4 commits into QuickTrace:main from dcjohnson24:add-new-filetype-support

.gitignore

-Original file line number
+Diff line change
@@ -0,0 +1,160 @@
+    # Byte-compiled / optimized / DLL files
+    __pycache__/
+    *.py[cod]
+    *$py.class
+    # C extensions
+    *.so
+    # Distribution / packaging
+    .Python
+    build/
+    develop-eggs/
+    dist/
+    downloads/
+    eggs/
+    .eggs/
+    lib/
+    lib64/
+    parts/
+    sdist/
+    var/
+    wheels/
+    share/python-wheels/
+    *.egg-info/
+    .installed.cfg
+    *.egg
+    MANIFEST
+    # PyInstaller
+    #  Usually these files are written by a python script from a template
+    #  before PyInstaller builds the exe, so as to inject date/other infos into it.
+    *.manifest
+    *.spec
+    # Installer logs
+    pip-log.txt
+    pip-delete-this-directory.txt
+    # Unit test / coverage reports
+    htmlcov/
+    .tox/
+    .nox/
+    .coverage
+    .coverage.*
+    .cache
+    nosetests.xml
+    coverage.xml
+    *.cover
+    *.py,cover
+    .hypothesis/
+    .pytest_cache/
+    cover/
+    # Translations
+    *.mo
+    *.pot
+    # Django stuff:
+    *.log
+    local_settings.py
+    db.sqlite3
+    db.sqlite3-journal
+    # Flask stuff:
+    instance/
+    .webassets-cache
+    # Scrapy stuff:
+    .scrapy
+    # Sphinx documentation
+    docs/_build/
+    # PyBuilder
+    .pybuilder/
+    target/
+    # Jupyter Notebook
+    .ipynb_checkpoints
+    # IPython
+    profile_default/
+    ipython_config.py
+    # pyenv
+    #   For a library or package, you might want to ignore these files since the code is
+    #   intended to run in multiple environments; otherwise, check them in:
+    # .python-version
+    # pipenv
+    #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+    #   However, in case of collaboration, if having platform-specific dependencies or dependencies
+    #   having no cross-platform support, pipenv may install dependencies that don't work, or not
+    #   install all needed dependencies.
+    #Pipfile.lock
+    # poetry
+    #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+    #   This is especially recommended for binary packages to ensure reproducibility, and is more
+    #   commonly ignored for libraries.
+    #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+    #poetry.lock
+    # pdm
+    #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+    #pdm.lock
+    #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+    #   in version control.
+    #   https://pdm.fming.dev/#use-with-ide
+    .pdm.toml
+    # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+    __pypackages__/
+    # Celery stuff
+    celerybeat-schedule
+    celerybeat.pid
+    # SageMath parsed files
+    *.sage.py
+    # Environments
+    .env
+    .venv
+    env/
+    venv/
+    ENV/
+    env.bak/
+    venv.bak/
+    # Spyder project settings
+    .spyderproject
+    .spyproject
+    # Rope project settings
+    .ropeproject
+    # mkdocs documentation
+    /site
+    # mypy
+    .mypy_cache/
+    .dmypy.json
+    dmypy.json
+    # Pyre type checker
+    .pyre/
+    # pytype static type analyzer
+    .pytype/
+    # Cython debug symbols
+    cython_debug/
+    # PyCharm
+    #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+    #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+    #  and can be added to the global gitignore or merged into this file.  For a more nuclear
+    #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+    #.idea/

README.md

-Original file line number
+Diff line change
@@ Expand Up @@
     providing journalists with lightning-fast access to critical information. Maximize efficiency, uncover the truth,
     and elevate your investigations with QuickTrace—the trusted companion of every
     journalist committed to impactful and comprehensive reporting.
+    ## Connecting to Google Drive
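+    Enable the Google Drive API for your account by following the instructions [here](https://developers.google.com/drive/api/quickstart/python). Save the resulting `credentials.json` file in the top-level directory of this repository; it contains secrets, so do not commit it to version control.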
+    To download all files from your Google Drive account to be uploaded to QuickTrace,
+    run `python google_drive.py`.
+    To search for a specific filetype, run `python -c "from google_drive import search_filetype; search_filetype('filename.ext')"`.

app.py

            
                      Original file line number
                      Diff line number
                      Diff line change
                  
    @@ -7,6 +7,7 @@
  
    from langchain.chat_models import ChatOpenAI

    from langchain.text_splitter import CharacterTextSplitter

    from langchain.vectorstores import FAISS

    from pathlib import Path

    from audio_utils import convert_audio_to_text

    from file_knowledge import FileKnowledge

    @@ -66,11 +67,11 @@ def initialize_sidebar(session):
  
        with st.sidebar:

            show_all_konwledge = st.button("Show all knowledge", key="show_all_konwledge")

            with st.expander("Upload files"):

                process_files("pdf", get_splitter(), session)

                process_files("m4a", get_splitter(), session)

                process_files(get_splitter(), session)

            st.header("Journalist toolbox")

            st.write("Upload your PDF file or audio file")

            st.write("Upload your PDF, audio, text, or csv files")

            st.write("Then ask a question and get an answer")

            st.write("You can also download the text of the uploaded files")

            st.divider()

    @@ -86,9 +87,10 @@ def get_splitter():
  
            length_function=len,

        )

    def process_files(file_type, splitter, session):

        files = st.file_uploader(f"Upload your {file_type} file", type=[file_type], accept_multiple_files=True)

    def process_files(splitter, session):

        files = st.file_uploader(f"Upload your files!", accept_multiple_files=True)

        for file in files:

            file_type = Path(file.name).suffix.split('.')[1]

            if file.name not in st.session_state["knowledge"].keys():

                file_knowledge = FileKnowledge(name=file.name, file=file, filetype=file_type, splitter=splitter)

                session[file.name] = file_knowledge

file_knowledge.py

-Original file line number
+Diff line change
@@ Expand Up / @@ -2,13 +2,15 @@ @@
     import tempfile
     from dataclasses import dataclass, field
     from typing import Any, List, TypeVar
+    from io import StringIO
     from langchain.text_splitter import CharacterTextSplitter
     from PyPDF2 import PdfReader
     from audio_utils import convert_audio_to_text
     UploadedFile = TypeVar('UploadedFile', bound=Any)
+    SUPPORTED_FILE_TYPES = ['pdf', 'csv', 'txt', 'html', 'm4a', 'eml', 'msg', 'mbox']
     @dataclass
@@ Expand All / @@ -23,6 +25,7 @@ class FileKnowledge: @@
         def __post_init__(self):
             self.content = self.extract_text()
             self.chunks = self.splitter.split_text(self.content)
         @property
         def content(self):
@@ Expand All / @@ -42,6 +45,7 @@ def chunks(self, value): @@
             self._chunks = value
             self.save_to_session_state()
         def save_to_session_state(self):
             st.session_state.knowledge[self.name] = self
@@ Expand All / @@ -50,8 +54,14 @@ def extract_text(self): @@
                 return self.extract_text_from_pdf()
             elif self.filetype == 'm4a':
                 return self.extract_text_from_audio()
+            elif self.filetype == 'txt':
+                return self.extract_text_generic()
+            elif self.filetype == 'csv':
+                return self.extract_text_generic()
             else:
-                raise ValueError(f'Unsupported filetype: {self.filetype}')
+                if not self.filetype in SUPPORTED_FILE_TYPES:
+                    raise ValueError(f'Unsupported filetype: {self.filetype}')
         def extract_text_from_pdf(self):
             # Add your code here to extract text from a PDF file
@@ Expand All / @@ -60,7 +70,11 @@ def extract_text_from_pdf(self): @@
             for page in pdf_reader.pages:
                 text += page.extract_text()
             return text
+        def extract_text_generic(self):
+            """Return the uploaded file's entire contents decoded as text.
+
+            Used by ``extract_text`` for formats that need no parsing
+            (``txt`` and ``csv``).
+
+            The bytes are decoded with ``utf-8-sig`` rather than ``utf-8`` so
+            that a leading UTF-8 byte-order mark (commonly written by
+            spreadsheet CSV exports) is dropped instead of leaking into the
+            text as U+FEFF. Input without a BOM decodes exactly as before.
+
+            Returns:
+                str: the decoded file contents, line endings untouched.
+
+            Raises:
+                UnicodeDecodeError: if the file is not valid UTF-8.
+            """
+            # NOTE(review): assumes self.file is a BytesIO-like upload exposing
+            # getvalue() -- confirm against the uploader in app.py.
+            # Wrapping the decoded string in StringIO only to read() it back
+            # was a no-op, so the string is returned directly.
+            return self.file.getvalue().decode("utf-8-sig")
         def extract_text_from_audio(self):
             with tempfile.NamedTemporaryFile(delete=False, suffix=".m4a") as tmp:
                 tmp.write(self.file.read())
@@ Expand Down @@

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Add support for csv and txt files #5

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!

Add support for csv and txt files #5

Are you sure you want to change the base?

Uh oh!

Add support for csv and txt files #5

Uh oh!

Uh oh!

Diff view

Diff view

There are no files selected for viewing

Uh oh!