Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
160 changes: 160 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
8 changes: 8 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,11 @@ Harnessing cutting-edge artificial intelligence, QuickTrace accelerates the inve
providing journalists with lightning-fast access to critical information. Maximize efficiency, uncover the truth,
and elevate your investigations with QuickTrace—the trusted companion of every
journalist committed to impactful and comprehensive reporting.

## Connecting to Google Drive
Enable the Google Drive API for your account by following the [Google Drive API Python quickstart](https://developers.google.com/drive/api/quickstart/python). Save the resulting `credentials.json` file in the top-level directory.

To download all files from your Google Drive account to be uploaded to QuickTrace,
run `python google_drive.py`.

To search for a specific filetype, use `python -c "from google_drive import search_filetype; search_filetype('filename.ext')"`.
12 changes: 7 additions & 5 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from pathlib import Path

from audio_utils import convert_audio_to_text
from file_knowledge import FileKnowledge
Expand Down Expand Up @@ -66,11 +67,11 @@ def initialize_sidebar(session):
with st.sidebar:
show_all_konwledge = st.button("Show all knowledge", key="show_all_konwledge")
with st.expander("Upload files"):
process_files("pdf", get_splitter(), session)
process_files("m4a", get_splitter(), session)
process_files(get_splitter(), session)

st.header("Journalist toolbox")
st.write("Upload your PDF file or audio file")
st.write("Upload your PDF, audio, text, or csv files")
st.write("Then ask a question and get an answer")
st.write("You can also download the text of the uploaded files")
st.divider()
Expand All @@ -86,9 +87,10 @@ def get_splitter():
length_function=len,
)

def process_files(file_type, splitter, session):
files = st.file_uploader(f"Upload your {file_type} file", type=[file_type], accept_multiple_files=True)
def process_files(splitter, session):
files = st.file_uploader(f"Upload your files!", accept_multiple_files=True)
for file in files:
file_type = Path(file.name).suffix.split('.')[1]
if file.name not in st.session_state["knowledge"].keys():
file_knowledge = FileKnowledge(name=file.name, file=file, filetype=file_type, splitter=splitter)
session[file.name] = file_knowledge
Expand Down
18 changes: 16 additions & 2 deletions file_knowledge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
import tempfile
from dataclasses import dataclass, field
from typing import Any, List, TypeVar
from io import StringIO

from langchain.text_splitter import CharacterTextSplitter
from PyPDF2 import PdfReader

from audio_utils import convert_audio_to_text

UploadedFile = TypeVar('UploadedFile', bound=Any)
SUPPORTED_FILE_TYPES = ['pdf', 'csv', 'txt', 'html', 'm4a', 'eml', 'msg', 'mbox']


@dataclass
Expand All @@ -23,6 +25,7 @@ class FileKnowledge:
def __post_init__(self):
self.content = self.extract_text()
self.chunks = self.splitter.split_text(self.content)


@property
def content(self):
Expand All @@ -42,6 +45,7 @@ def chunks(self, value):
self._chunks = value
self.save_to_session_state()


def save_to_session_state(self):
st.session_state.knowledge[self.name] = self

Expand All @@ -50,8 +54,14 @@ def extract_text(self):
return self.extract_text_from_pdf()
elif self.filetype == 'm4a':
return self.extract_text_from_audio()
elif self.filetype == 'txt':
return self.extract_text_generic()
elif self.filetype == 'csv':
return self.extract_text_generic()

else:
raise ValueError(f'Unsupported filetype: {self.filetype}')
if not self.filetype in SUPPORTED_FILE_TYPES:
raise ValueError(f'Unsupported filetype: {self.filetype}')

def extract_text_from_pdf(self):
# Add your code here to extract text from a PDF file
Expand All @@ -60,7 +70,11 @@ def extract_text_from_pdf(self):
for page in pdf_reader.pages:
text += page.extract_text()
return text


def extract_text_generic(self):
    """Return the full contents of ``self.file`` decoded as UTF-8 text.

    Used for uploads that are already plain text; ``extract_text`` routes
    the 'txt' and 'csv' file types here.

    Returns:
        str: the decoded file contents, unmodified (no newline translation).

    Raises:
        UnicodeDecodeError: if the file's bytes are not valid UTF-8.
    """
    # Assumes self.file exposes getvalue() returning bytes (BytesIO-like
    # upload object) -- TODO confirm against the caller.
    # The decoded string is returned directly: wrapping it in a StringIO
    # only to read it all back yields exactly the same string.
    return self.file.getvalue().decode("utf-8")

def extract_text_from_audio(self):
with tempfile.NamedTemporaryFile(delete=False, suffix=".m4a") as tmp:
tmp.write(self.file.read())
Expand Down
Loading