From 7108076cfddd33e858edde95c879cb6804805f4e Mon Sep 17 00:00:00 2001
From: mattkiefer
Date: Sat, 10 Jun 2023 13:51:18 -0500
Subject: [PATCH 1/2] starting a crawler feature

---
 .gitignore |  2 ++
 crawler.py | 13 +++++++++++++
 2 files changed, 15 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 crawler.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0d017ea
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+# created by virtualenv automatically
+
diff --git a/crawler.py b/crawler.py
new file mode 100644
index 0000000..3e35a67
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,13 @@
+import requests
+
+def crawl_page(url,file_exts=None,keywords=None):
+    """
+    given a web page url -
+    and optional list of file exts and keywords -
+    crawl page for matching files
+    and return download urls
+    """
+    # get page
+    page = requests.get(url)
+
+
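A minimal sketch of the call shape the docstring above describes, for orientation only: the stub does not yet act on file_exts or keywords, and the URL and argument values below are hypothetical.

    from crawler import crawl_page

    # intended usage per the docstring; file_exts and keywords are
    # placeholder arguments the stub above does not honor yet
    download_urls = crawl_page(
        "https://example.org/document-library",
        file_exts=["pdf"],
        keywords=["report"],
    )
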
From 212e52509967bbd5e375dd6d89e53d1fc425397c Mon Sep 17 00:00:00 2001
From: mattkiefer
Date: Sat, 10 Jun 2023 15:57:32 -0500
Subject: [PATCH 2/2] semi-functional scraper that pulls pdfs from a site -- specifically config'd for a DOJ page

---
 app.py     | 34 ++++++++++++++++++++++++++++++++++
 crawler.py | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 89 insertions(+), 4 deletions(-)

diff --git a/app.py b/app.py
index 7021c1c..07491dd 100644
--- a/app.py
+++ b/app.py
@@ -11,6 +11,8 @@ from audio_utils import convert_audio_to_text
 from file_knowledge import FileKnowledge
 
+from crawler import crawl_page
+
 
 def main():
     load_dotenv()
     st.set_page_config(page_title="Journalist's Toolbox", page_icon=":smiley:")
@@ -68,6 +70,7 @@ def initialize_sidebar(session):
     with st.expander("Upload files"):
         process_files("pdf", get_splitter(), session)
         process_files("m4a", get_splitter(), session)
+        crawl_site(get_splitter(),session)
 
     st.header("Journalist toolbox")
     st.write("Upload your PDF file or audio file")
@@ -95,6 +98,37 @@ def process_files(file_type, splitter, session):
                 add_document_to_vector_store(file_knowledge)
 
 
+def crawl_site(splitter,session):
+    """
+    user enters a url
+    crawl_page gets files linked from that page
+    and returns them to st session
+    """
+    # user input
+    url = st.text_input("scrape a url :articulated_lorry:")
+    # try to scrape
+    page_data = crawl_page(url)
+    # check if we have data
+    if page_data:
+        # we may have multiple results
+        for datum in page_data:
+            # buffer file
+            bufferwb = open('/tmp/' + datum['name'],'wb')
+            bufferwb.write(datum['content'])
+            bufferwb.close()
+            buffer = open('/tmp/' + datum['name'],'rb')
+            # get file knowledge
+            try:
+                file_knowledge = FileKnowledge(name=datum['name'],file=buffer,filetype=datum['ext'],splitter=splitter)
+                # add to session info
+                session[datum['name']] = file_knowledge
+                # add to vector store
+                add_document_to_vector_store(file_knowledge)
+
+            except Exception as e:
+                print(e)
+                import ipdb; ipdb.set_trace()
+
 def get_vector_store():
     if not hasattr(st.session_state, "vector_store"):
         raise ValueError("No vector store found")
diff --git a/crawler.py b/crawler.py
index 3e35a67..3821de3 100644
--- a/crawler.py
+++ b/crawler.py
@@ -1,4 +1,11 @@
 import requests
+import streamlit as st
+from bs4 import BeautifulSoup
+
+# we can only support certain filetypes rn
+crawlable_exts = ['pdf','download']
+# save $$$
+limit = 2
 
 def crawl_page(url,file_exts=None,keywords=None):
     """
@@ -7,7 +14,51 @@ def crawl_page(url,file_exts=None,keywords=None):
     crawl page for matching files
     and return download urls
     """
-    # get page
-    page = requests.get(url)
-
-
+    # keep track of files to download
+    get_list = []
+    # return name, content, ext payload list
+    payload = []
+    # no url will be given on page load
+    if url:
+        if 'http' not in url and 'https' not in url:
+            url = 'http://' + url
+        # get page
+        page = requests.get(url).content
+        # parse with beautiful soup
+        soup = BeautifulSoup(page,'html.parser')
+        # get links on page
+        links = soup.find_all('a')
+        # get pdf links
+        for link in links:
+            # check if ext is in accept list
+            for href_ext in crawlable_exts:
+                # check if this link contains href to acceptable ext
+                if 'href' in link.attrs and href_ext in link.attrs['href']:
+                    # if so get it
+                    get_list.append(link.attrs['href'])
+
+        # keep track of how many files you've scraped
+        counter = 0
+        # get the acceptable links we've collected
+        for link in get_list:
+            # file name
+            name = link.split('/')[-1]
+            # file ext
+            ext = name.split('.')[-1]
+            # hack TODO use .html or .txt if it's not a supported ext
+            if 'download' in link:
+                ext = 'pdf'
+                link = url + link
+                link = link.replace('capitol-breach-cases/usao-dc/','')
+                print('zyx',link)
+            # binary content
+            content = requests.get(link).content
+            # return as dict
+            payload.append({'name':name,'content':content,'ext':ext})
+            # increment
+            counter += 1
+            if counter == limit:
+                break
+
+    # zero, one or more file name, content exts
+    return payload[0:limit] # save $$$
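
A minimal sketch of consuming crawl_page()'s return value from a plain script, mirroring what crawl_site() in app.py does with the payload. The URL below is only a stand-in for the DOJ page the commit message mentions, and the /tmp paths are illustrative.

    from crawler import crawl_page

    # crawl_page() returns a list of {'name', 'content', 'ext'} dicts,
    # capped at `limit` entries to keep request volume (and cost) down
    for datum in crawl_page("https://www.example.gov/some-case-page"):
        path = '/tmp/' + datum['name']
        with open(path, 'wb') as fh:
            fh.write(datum['content'])  # raw bytes downloaded by the crawler
        print('saved', datum['ext'], 'file to', path)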