From 7108076cfddd33e858edde95c879cb6804805f4e Mon Sep 17 00:00:00 2001
From: mattkiefer
Date: Sat, 10 Jun 2023 13:51:18 -0500
Subject: [PATCH 1/2] starting a crawler feature

---
 .gitignore |  2 ++
 crawler.py | 13 +++++++++++++
 2 files changed, 15 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 crawler.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..0d017ea
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+# created by virtualenv automatically
+
diff --git a/crawler.py b/crawler.py
new file mode 100644
index 0000000..3e35a67
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,13 @@
+import requests
+
+def crawl_page(url,file_exts=None,keywords=None):
+    """
+    given a web page url -
+    and optional list of file exts and keywords -
+    crawl page for matching files
+    and return download urls
+    """
+    # get page
+    page = requests.get(url)
+
+
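A minimal sketch of the call shape the docstring above describes, for orientation only: the stub does not yet act on file_exts or keywords, and the URL and argument values below are hypothetical.

    from crawler import crawl_page

    # intended usage per the docstring; file_exts and keywords are
    # placeholder arguments the stub above does not honor yet
    download_urls = crawl_page(
        "https://example.org/document-library",
        file_exts=["pdf"],
        keywords=["report"],
    )
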
From 212e52509967bbd5e375dd6d89e53d1fc425397c Mon Sep 17 00:00:00 2001
From: mattkiefer
Date: Sat, 10 Jun 2023 15:57:32 -0500
Subject: [PATCH 2/2] semi-functional scraper that pulls pdfs from a site -- specifically config'd for a DOJ page

---
 app.py     | 34 ++++++++++++++++++++++++++++++++++
 crawler.py | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 89 insertions(+), 4 deletions(-)

diff --git a/app.py b/app.py
index 7021c1c..07491dd 100644
--- a/app.py
+++ b/app.py
@@ -11,6 +11,8 @@ from audio_utils import convert_audio_to_text
 from file_knowledge import FileKnowledge
 
+from crawler import crawl_page
+
 
 def main():
     load_dotenv()
     st.set_page_config(page_title="Journalist's Toolbox", page_icon=":smiley:")
@@ -68,6 +70,7 @@ def initialize_sidebar(session):
     with st.expander("Upload files"):
         process_files("pdf", get_splitter(), session)
         process_files("m4a", get_splitter(), session)
+        crawl_site(get_splitter(),session)
 
     st.header("Journalist toolbox")
     st.write("Upload your PDF file or audio file")
@@ -95,6 +98,37 @@ def process_files(file_type, splitter, session):
                 add_document_to_vector_store(file_knowledge)
 
 
+def crawl_site(splitter,session):
+    """
+    user enters a url
+    crawl_page gets files linked from that page
+    and returns them to st session
+    """
+    # user input
+    url = st.text_input("scrape a url :articulated_lorry:")
+    # try to scrape
+    page_data = crawl_page(url)
+    # check if we have data
+    if page_data:
+        # we may have multiple results
+        for datum in page_data:
+            # buffer file
+            bufferwb = open('/tmp/' + datum['name'],'wb')
+            bufferwb.write(datum['content'])
+            bufferwb.close()
+            buffer = open('/tmp/' + datum['name'],'rb')
+            # get file knowledge
+            try:
+                file_knowledge = FileKnowledge(name=datum['name'],file=buffer,filetype=datum['ext'],splitter=splitter)
+                # add to session info
+                session[datum['name']] = file_knowledge
+                # add to vector store
+                add_document_to_vector_store(file_knowledge)
+
+            except Exception as e:
+                print(e)
+                import ipdb; ipdb.set_trace()
+
 def get_vector_store():
     if not hasattr(st.session_state, "vector_store"):
         raise ValueError("No vector store found")
diff --git a/crawler.py b/crawler.py
index 3e35a67..3821de3 100644
--- a/crawler.py
+++ b/crawler.py
@@ -1,4 +1,11 @@
 import requests
+import streamlit as st
+from bs4 import BeautifulSoup
+
+# we can only support certain filetypes rn
+crawlable_exts = ['pdf','download']
+# save $$$
+limit = 2
 
 def crawl_page(url,file_exts=None,keywords=None):
     """
@@ -7,7 +14,51 @@ def crawl_page(url,file_exts=None,keywords=None):
     crawl page for matching files
     and return download urls
     """
-    # get page
-    page = requests.get(url)
-
-
+    # keep track of files to download
+    get_list = []
+    # return name, content, ext payload list
+    payload = []
+    # no url will be given on page load
+    if url:
+        if 'http' not in url and 'https' not in url:
+            url = 'http://' + url
+        # get page
+        page = requests.get(url).content
+        # parse with beautiful soup
+        soup = BeautifulSoup(page,'html.parser')
+        # get links on page
+        links = soup.find_all('a')
+        # get pdf links
+        for link in links:
+            # check if ext is in accept list
+            for href_ext in crawlable_exts:
+                # check if this link contains href to acceptable ext
+                if 'href' in link.attrs and href_ext in link.attrs['href']:
+                    # if so get it
+                    get_list.append(link.attrs['href'])
+
+        # keep track of how many files you've scraped
+        counter = 0
+        # get the acceptable links we've collected
+        for link in get_list:
+            # file name
+            name = link.split('/')[-1]
+            # file ext
+            ext = name.split('.')[-1]
+            # hack TODO use .html or .txt if it's not a supported ext
+            if 'download' in link:
+                ext = 'pdf'
+                link = url + link
+                link = link.replace('capitol-breach-cases/usao-dc/','')
+                print('zyx',link)
+            # binary content
+            content = requests.get(link).content
+            # return as dict
+            payload.append({'name':name,'content':content,'ext':ext})
+            # increment
+            counter += 1
+            if counter == limit:
+                break
+
+    # zero, one or more file name, content exts
+    return payload[0:limit] # save $$$
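
A minimal sketch of consuming crawl_page()'s return value from a plain script, mirroring what crawl_site() in app.py does with the payload. The URL below is only a stand-in for the DOJ page the commit message mentions, and the /tmp paths are illustrative.

    from crawler import crawl_page

    # crawl_page() returns a list of {'name', 'content', 'ext'} dicts,
    # capped at `limit` entries to keep request volume (and cost) down
    for datum in crawl_page("https://www.example.gov/some-case-page"):
        path = '/tmp/' + datum['name']
        with open(path, 'wb') as fh:
            fh.write(datum['content'])  # raw bytes downloaded by the crawler
        print('saved', datum['ext'], 'file to', path)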