From bd10294d8b79ff78590b5707c7632ecacd5042d8 Mon Sep 17 00:00:00 2001
From: Zetaphor
Date: Wed, 24 Apr 2024 18:18:23 -0500
Subject: [PATCH 01/11] Add requirements.txt

---
 requirements.txt | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 requirements.txt

diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..5a070d7
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,4 @@
+flask
+openai
+beautifulsoup4
+python-dotenv
\ No newline at end of file

From b3fb2876e7884046268624873965b8e6dff96bd4 Mon Sep 17 00:00:00 2001
From: Zetaphor
Date: Wed, 24 Apr 2024 21:15:35 -0500
Subject: [PATCH 02/11] Load URL and API key from env file

---
 .env            | 2 ++
 ReaperEngine.py | 6 +++++-
 2 files changed, 7 insertions(+), 1 deletion(-)
 create mode 100644 .env

diff --git a/.env b/.env
new file mode 100644
index 0000000..e022060
--- /dev/null
+++ b/.env
@@ -0,0 +1,2 @@
+BASE_URL="http://localhost:1234/v1/"
+API_KEY="Dead Internet"
\ No newline at end of file

diff --git a/ReaperEngine.py b/ReaperEngine.py
index f11abb8..d68cae6 100644
--- a/ReaperEngine.py
+++ b/ReaperEngine.py
@@ -1,15 +1,19 @@
+import os
 import json
 from openai import OpenAI
 from bs4 import BeautifulSoup
+from dotenv import load_dotenv
 
 '''
 About the name... I apologise for it sounding pretentious or whatever, but
 I don't care it sounds cool and cyberpunk-y(-ish) and fits with the
 Dead Internet Theory theme of this little project
 '''
 
+load_dotenv()
+
 class ReaperEngine:
     def __init__(self):
-        self.client = OpenAI(base_url="http://localhost:11434/v1/", api_key="Dead Internet") # Ollama is pretty cool
+        self.client = OpenAI(base_url=os.getenv("BASE_URL"), api_key=os.getenv("API_KEY")) # Ollama is pretty cool
         self.internet_db = dict() # TODO: Exporting this sounds like a good idea, losing all your pages when you kill the script kinda sucks ngl, also loading it is a thing too
         self.temperature = 2.1 # Crank up for goofier webpages (but probably less functional javascript)

From 7a7a34f36551d426e661a1b37afa0676e9506703 Mon Sep 17 00:00:00 2001
From: Zetaphor
Date: Wed, 24 Apr 2024 21:16:15 -0500
Subject: [PATCH 03/11] Add gitignore

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 .gitignore

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..6747cff
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+__pycache__
+internet.json
+curpage.html
\ No newline at end of file

From 56557172347f5213ccbb51b24c6d49cdde86c27c Mon Sep 17 00:00:00 2001
From: Zetaphor
Date: Wed, 24 Apr 2024 21:18:15 -0500
Subject: [PATCH 04/11] Update readme to include requirements.txt and env
 instructions

---
 README.md | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index dc9ff58..a5c81d7 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # Dead-Internet
 So we all know the classic [Dead Internet Theory](https://en.wikipedia.org/wiki/Dead_Internet_theory), and if you're reading this I assume you at least know what an LLM is. Need I say much more? Yeah of course!
 
-This is a little project I threw together in a couple hours that lets you surf a completely fake web! You run a search query in the only non-generated page `/` and it generates a search results page with fake links that lead to fake websites that lead to more fake websites!
+This is a little project I threw together in a couple hours that lets you surf a completely fake web!
+You run a search query in the only non-generated page `/` and it generates a search results page with fake links that lead to fake websites that lead to more fake websites!
 
 It's not perfect, not by a long shot, but it works well enough for me to spend like an hour just going through it and laughing at what it makes.
 
 If you encounter any issues with the search results page, reload and it'll generate a new page. If you get any issues with the other generated pages then try making slight adjustments to the URL to get a different page, right now there isn't yet a way to regenerate a page.
@@ -16,6 +16,10 @@ Next you'll need to install Python if you don't already have it, I run Python 3
 - [OpenAI](https://pypi.org/project/openai/)
 - [BeautifulSoup4](https://pypi.org/project/beautifulsoup4/)
 - [Flask](https://pypi.org/project/Flask/)
 
+You can install them by running `pip install -r requirements.txt`.
+
+You can modify the API URL and API key in the `.env` file.
+
 Once those are installed, simply run the main.py file and navigate to http://127.0.0.1:5000 or whatever URL Flask gives you and have fun!
 ## Inspiration

From d3e50bf8a4689b81e1d733072ddf8ce0ce5cdb5b Mon Sep 17 00:00:00 2001
From: Zetaphor
Date: Wed, 24 Apr 2024 22:09:37 -0500
Subject: [PATCH 05/11] Implement optional SearXNG support

---
 .env            |  5 ++-
 README.md       |  4 +++
 ReaperEngine.py | 86 +++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 77 insertions(+), 18 deletions(-)

diff --git a/.env b/.env
index e022060..47c2063 100644
--- a/.env
+++ b/.env
@@ -1,2 +1,5 @@
 BASE_URL="http://localhost:1234/v1/"
-API_KEY="Dead Internet"
\ No newline at end of file
+API_KEY="Dead Internet"
+SEARXNG_URL="https://YOUR_SEARXNG_URL/search"
+ENABLE_IMAGES=False
+MAX_IMAGE_WIDTH=300
\ No newline at end of file

diff --git a/README.md b/README.md
index a5c81d7..abad079 100644
--- a/README.md
+++ b/README.md
@@ -22,5 +22,9 @@ You can modify the API URL and API key in the `.env` file.
 
 Once those are installed, simply run the main.py file and navigate to http://127.0.0.1:5000 or whatever URL Flask gives you and have fun!
 
+## Image Support
+
+Optional image support is implemented using the [SearXNG](https://docs.searxng.org/) search engine. To enable it, set the `ENABLE_IMAGES` environment variable to `true` and set the `SEARXNG_URL` environment variable to the URL of your SearXNG instance. This requires the JSON output format to be enabled in your instance's `settings.yml`, which it is not by default.
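+
+If I'm reading the SearXNG docs right, enabling that looks something like this in `settings.yml` (just a sketch, double-check it against your instance's version):
+
+```yaml
+search:
+  formats:
+    - html
+    - json
+```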
+
 ## Inspiration
 I'll admit it, I'm not the most creative person. I got this idea from [this reddit comment on r/localllama](https://new.reddit.com/r/LocalLLaMA/comments/1c6ejb8/comment/l02eeqx/), so thank you very much commenter!

diff --git a/ReaperEngine.py b/ReaperEngine.py
index d68cae6..aa3e7fd 100644
--- a/ReaperEngine.py
+++ b/ReaperEngine.py
@@ -1,9 +1,12 @@
 import os
 import json
+import requests
+import random
 from openai import OpenAI
 from bs4 import BeautifulSoup
 from dotenv import load_dotenv
+
 
 '''
 About the name... I apologise for it sounding pretentious or whatever, but
 I don't care it sounds cool and cyberpunk-y(-ish) and fits with the
 Dead Internet Theory theme of this little project
 '''
@@ -18,31 +21,80 @@ def __init__(self):
         self.temperature = 2.1 # Crank up for goofier webpages (but probably less functional javascript)
         self.max_tokens = 4096
 
-        self.system_prompt = "You are an expert in creating realistic webpages. You do not create sample pages, instead you create webpages that are completely realistic and look as if they really existed on the web. You do not respond with anything but HTML, starting your messages with <html> and ending them with </html>. If a requested page is not a HTML document, for example a CSS or Javascript file, write that language instead of writing any HTML. If the requested page is instead an image file or other non-text resource, attempt to generate an appropriate resource for it instead of writing any HTML. You use very little to no images at all in your HTML, CSS or JS."
+        self.enable_images = os.getenv("ENABLE_IMAGES", "").lower() in ("true", "1") # bool(os.getenv(...)) would be True for any non-empty value, even "False"
+
+        self.system_prompt = "You are an expert in creating realistic webpages. You do not create sample pages, instead you create webpages that are completely realistic and look as if they really existed on the web. You do not respond with anything but HTML, starting your messages with <html> and ending them with </html>. If a requested page is not a HTML document, for example a CSS or Javascript file, write that language instead of writing any HTML."
+
+        if self.enable_images:
+            self.system_prompt += " If the requested page is an image file, generate an <img> tag with an alt tag. Images should always have an alt tag. Images should always have a width attribute. If the requested page is instead another non-text resource, attempt to generate an appropriate resource for it instead of writing any HTML."
+        else:
+            self.system_prompt += " If the requested page is instead an image file or other non-text resource, attempt to generate an appropriate resource for it instead of writing any HTML. You use very little to no images at all in your HTML, CSS or JS."
+
+    def image_search(self, keyword):
+        # URL of the SearXNG API
+        url = os.getenv("SEARXNG_URL")
+
+        params = {
+            'q': keyword,
+            'format': 'json',
+            'categories': 'images'
+        }
+
+        try:
+            response = requests.get(url, params=params)
+            response.raise_for_status()
+            data = response.json()
+
+            if data['results']:
+                return data['results'][0]['img_src'] # Return the source URL of the first image
+            else:
+                return None
+
+        except requests.RequestException as e:
+            print(f"Error fetching image: {e}")
+            return "https://via.placeholder.com/100"
+
-    def _sanitize_links(self, dirty_html):
+    def _sanitize_html(self, dirty_html):
         # Teensy function to replace all links on the page so they link to the root of the server
         # Also to get rid of any http(s), this'll help make the link database more consistent
-
         soup = BeautifulSoup(dirty_html, "html.parser")
+
+        # Replace any https references to keep the link database consistent
         for a in soup.find_all("a"):
-            print(a["href"])
-            if "mailto:" in a["href"]:
+            href = a.get("href", "")
+            if "mailto:" in href:
                 continue
-            a["href"] = a["href"].replace("http://", "")
-            a["href"] = a["href"].replace("https://", "")
-            a["href"] = "/" + a["href"]
+            clean_href = href.replace("http://", "").replace("https://", "")
+            a["href"] = "/" + clean_href
+
+        # Update and adjust image tags
+        for img in soup.find_all("img"):
+            if "width" not in img.attrs:
+                # Assign a random width between 100 and 300px if width is not present
+                img["width"] = str(random.randint(100, 300))
+            else:
+                # Ensure the width does not exceed the max width
+                if int(img["width"]) > int(os.getenv("MAX_IMAGE_WIDTH")):
+                    img["width"] = os.getenv("MAX_IMAGE_WIDTH")
+
+            alt_text = img.get("alt", "")
+            new_src = self.image_search(alt_text)
+            img["src"] = new_src
+
         return str(soup)
 
     def get_index(self):
         # Super basic start page, just to get everything going
         return "<html><head><title>Dead Internet</title></head><body><h1>Enter the Dead Internet</h1><form action='/search'><input name='query'><input type='submit' value='Search'></form></body></html>"
 
     def get_page(self, url, path, query=None):
         # Return already generated page if already generated page
         try: return self.internet_db[url][path]
         except: pass
 
         # Construct the basic prompt
         prompt = f"Give me a classic geocities-style webpage from the fictional site of '{url}' at the resource path of '{path}'. Make sure all links generated either link to an external website, or if they link to another resource on the current website have the current url prepended ({url}) to them. For example if a link on the page has the href of 'help' or '/help', it should be replaced with '{url}/help'."
         # TODO: I wanna add all other pages to the prompt so the next pages generated resemble them, but since Llama 3 is only 8k context I hesitate to do so
 
         # Add other pages to the prompt if they exist
         if url in self.internet_db and len(self.internet_db[url]) > 1:
             pass
 
         # Generate the page
         generated_page_completion = self.client.chat.completions.create(messages=[
             {
                 "role": "system",
                 "content": self.system_prompt
             },
             {
                 "role": "user",
                 "content": prompt
             }],
             model="llama3",
             temperature=self.temperature,
             max_tokens=self.max_tokens
         )
 
         # Add the page to the database
         generated_page = generated_page_completion.choices[0].message.content
         if not url in self.internet_db: self.internet_db[url] = dict()
-        self.internet_db[url][path] = self._sanitize_links(generated_page)
+        self.internet_db[url][path] = self._sanitize_html(generated_page)
         open("curpage.html", "w+").write(generated_page)
 
-        return self._sanitize_links(generated_page)
+        return self._sanitize_html(generated_page)
 
     def get_search(self, query):
         # Generates a cool little search page, this differs in literally every search and is not cached so be wary of losing links
         search_page_completion = self.client.chat.completions.create(messages=[
             {
                 "role": "system",
                 "content": self.system_prompt
             },
             {
                 "role": "user",
                 "content": f"Generate the search results page for a fictitious search engine where the search query is '{query}'. Please include at least 10 results to different fictitious websites that relate to the query. DO NOT link to any real websites, every link should lead to a fictitious website. Feel free to add a bit of CSS to make the page look nice. Each search result will link to its own unique website that has nothing to do with the search engine. Make sure each fictitious website has a unique and somewhat creative URL. Don't mention that the results are fictitious."
             }],
             model="llama3",
             temperature=self.temperature,
             max_tokens=self.max_tokens
         )
 
-        return self._sanitize_links(search_page_completion.choices[0].message.content)
+        return self._sanitize_html(search_page_completion.choices[0].message.content)
 
     def export_internet(self, filename="internet.json"):
         json.dump(self.internet_db, open(filename, "w+"))

From d371925d73612635f4e42ec0264f7c73edd61300 Mon Sep 17 00:00:00 2001
From: Zetaphor
Date: Wed, 24 Apr 2024 22:27:33 -0500
Subject: [PATCH 06/11] Rename format function to support #2

---
 .env            | 5 -----
 ReaperEngine.py | 8 ++++----
 2 files changed, 4 insertions(+), 9 deletions(-)
 delete mode 100644 .env

diff --git a/.env b/.env
deleted file mode 100644
index 47c2063..0000000
--- a/.env
+++ /dev/null
@@ -1,5 +0,0 @@
-BASE_URL="http://localhost:1234/v1/"
-API_KEY="Dead Internet"
-SEARXNG_URL="https://YOUR_SEARXNG_URL/search"
-ENABLE_IMAGES=False
-MAX_IMAGE_WIDTH=300
\ No newline at end of file

diff --git a/ReaperEngine.py b/ReaperEngine.py
index aa3e7fd..8d3f6f3 100644
--- a/ReaperEngine.py
+++ b/ReaperEngine.py
@@ -54,7 +54,7 @@ def image_search(self, keyword):
             print(f"Error fetching image: {e}")
             return "https://via.placeholder.com/100"
 
-    def _sanitize_html(self, dirty_html):
+    def _format_html(self, dirty_html):
         # Teensy function to replace all links on the page so they link to the root of the server
         # Also to get rid of any http(s), this'll help make the link database more consistent
@@ -122,10 +122,10 @@ def get_page(self, url, path, query=None):
         generated_page = generated_page_completion.choices[0].message.content
         if not url in self.internet_db: self.internet_db[url] = dict()
-        self.internet_db[url][path] = self._sanitize_html(generated_page)
+        self.internet_db[url][path] = self._format_html(generated_page)
         open("curpage.html", "w+").write(generated_page)
 
-        return self._sanitize_html(generated_page)
+        return self._format_html(generated_page)
 
     def get_search(self, query):
@@ -136,7 +136,7 @@ def get_search(self, query):
             max_tokens=self.max_tokens
         )
 
-        return self._sanitize_html(search_page_completion.choices[0].message.content)
+        return self._format_html(search_page_completion.choices[0].message.content)
 
     def export_internet(self, filename="internet.json"):
         json.dump(self.internet_db, open(filename, "w+"))

From cada124fa8cfbb05caefa14db37b67e1197dd8c7 Mon Sep 17 00:00:00 2001
From: Zetaphor
Date: Wed, 24 Apr 2024 22:28:39 -0500
Subject: [PATCH 07/11] Actually do the thing I just said I would

---
 ReaperEngine.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ReaperEngine.py b/ReaperEngine.py
index 8d3f6f3..4497575 100644
--- a/ReaperEngine.py
+++ b/ReaperEngine.py
@@ -54,7 +54,7 @@ def image_search(self, keyword):
             print(f"Error fetching image: {e}")
             return "https://via.placeholder.com/100"
 
-    def _format_html(self, dirty_html):
+    def _format_page(self, dirty_html):
         # Teensy function to replace all links on the page so they link to the root of the server
         # Also to get rid of any http(s), this'll help make the link database more consistent
@@ -122,10 +122,10 @@ def get_page(self, url, path, query=None):
         generated_page = generated_page_completion.choices[0].message.content
         if not url in self.internet_db: self.internet_db[url] = dict()
-        self.internet_db[url][path] = self._format_html(generated_page)
+        self.internet_db[url][path] = self._format_page(generated_page)
         open("curpage.html", "w+").write(generated_page)
 
return self._sanitize_html(generated_page) + return self._format_html(generated_page) def get_search(self, query): # Generates a cool little search page, this differs in literally every search and is not cached so be weary of losing links @@ -143,7 +143,7 @@ def get_search(self, query): max_tokens=self.max_tokens ) - return self._sanitize_html(search_page_completion.choices[0].message.content) + return self._format_html(search_page_completion.choices[0].message.content) def export_internet(self, filename="internet.json"): json.dump(self.internet_db, open(filename, "w+")) From cada124fa8cfbb05caefa14db37b67e1197dd8c7 Mon Sep 17 00:00:00 2001 From: Zetaphor Date: Wed, 24 Apr 2024 22:28:39 -0500 Subject: [PATCH 07/11] Actually do the thing I just said I would --- ReaperEngine.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ReaperEngine.py b/ReaperEngine.py index 8d3f6f3..4497575 100644 --- a/ReaperEngine.py +++ b/ReaperEngine.py @@ -54,7 +54,7 @@ def image_search(self, keyword): print(f"Error fetching image: {e}") return "https://via.placeholder.com/100" - def _format_html(self, dirty_html): + def _format_page(self, dirty_html): # Teensy function to replace all links on the page so they link to the root of the server # Also to get rid of any http(s), this'll help make the link database more consistent @@ -122,10 +122,10 @@ def get_page(self, url, path, query=None): generated_page = generated_page_completion.choices[0].message.content if not url in self.internet_db: self.internet_db[url] = dict() - self.internet_db[url][path] = self._format_html(generated_page) + self.internet_db[url][path] = self._format_page(generated_page) open("curpage.html", "w+").write(generated_page) - return self._format_html(generated_page) + return self._format_page(generated_page) def get_search(self, query): # Generates a cool little search page, this differs in literally every search and is not cached so be weary of losing links @@ -143,7 +143,7 @@ def get_search(self, query): max_tokens=self.max_tokens ) - return self._format_html(search_page_completion.choices[0].message.content) + return self._format_page(search_page_completion.choices[0].message.content) def export_internet(self, filename="internet.json"): json.dump(self.internet_db, open(filename, "w+")) From 084b2d552d79756231a6fff6ac0c00ef47b48c4b Mon Sep 17 00:00:00 2001 From: sebc <47074056+Sebby37@users.noreply.github.com> Date: Thu, 25 Apr 2024 13:14:12 +0930 Subject: [PATCH 08/11] Add gitignore and the fabled requirements.txt --- .gitignore | 4 ++-- requirements.txt | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 6747cff..ebecbe8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ -__pycache__ +__pycache__/ +curpage.html internet.json -curpage.html \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 5a070d7..22cc4b8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ flask openai beautifulsoup4 -python-dotenv \ No newline at end of file +python-dotenv From 943ca7ada6f657f19176f4ea28a41065d43a1ab3 Mon Sep 17 00:00:00 2001 From: Seb C <47074056+Sebby37@users.noreply.github.com> Date: Thu, 25 Apr 2024 14:06:15 +0930 Subject: [PATCH 09/11] Update README.md --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index abad079..6f29051 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,11 @@ If you encounter any issues with the search results page, reload and it'll gener Also 
 Also when you navigate to the `/_export` path or kill the server, the JSON of your current internet will be saved to the file `internet.json` in the root of the project. Right now you can't load it back yet but maybe I'll add that in the future if I want, or you could fork it and add it yourself, the code isn't very complicated at all.
 
 ## How do I run this???
-Simple, first install Ollama [here](https://ollama.com/download), then pull your model of choice. The one I used is [Llama 3 8B Instruct](https://ollama.com/library/llama3) which works really well and is very impressive for an 8B model.
+Simple, first install Ollama [here](https://ollama.com/download), then pull your model of choice. The one I used is [Llama 3 8B Instruct](https://ollama.com/library/llama3) which works really well and is very impressive for an 8B model. If you don't want to use Ollama you can use any other OpenAI-compatible server by modifying the `client` declaration in ReaperEngine.py to link to your server; I recommend [llama.cpp's server example](https://github.com/ggerganov/llama.cpp/tree/master/examples/server) for something lightweight, or [text-generation-webui](https://github.com/oobabooga/text-generation-webui/) for a fully featured LLM web interface.
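+
+For example, since the base URL and API key live in the `.env` file, pointing this at a local llama.cpp server should just be a config change, something like this (this assumes llama.cpp's defaults of port 8080 and an OpenAI-compatible API under `/v1`, so double-check your setup):
+
+```
+BASE_URL="http://localhost:8080/v1/"
+API_KEY="Dead Internet"
+```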
 
-Next you'll need to install Python if you don't already have it, I run Python 3.10.12 (came with my Linux Mint install), then the libraries you'll need are:
+Due to popular demand and it not being 12am anymore I finally added a requirements.txt file! Now instead of manually installing dependencies you can just run `pip install -r requirements.txt` in the root of the project and it'll install them all for you!
+
+(If you want to manually install dependencies, follow these instructions) Next you'll need to install Python if you don't already have it, I run Python 3.10.12 (came with my Linux Mint install), then the libraries you'll need are:

From d2a38b4e7cf308a5849ddcbd6bf8f358e7d0e719 Mon Sep 17 00:00:00 2001
From: sebc <47074056+Sebby37@users.noreply.github.com>
Date: Thu, 25 Apr 2024 15:18:48 +0930
Subject: [PATCH 10/11] Upgrade some of the prompts, make code a little better

---
 ReaperEngine.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/ReaperEngine.py b/ReaperEngine.py
index 4497575..8336f55 100644
--- a/ReaperEngine.py
+++ b/ReaperEngine.py
@@ -96,7 +96,7 @@ def get_page(self, url, path, query=None):
         except: pass
 
         # Construct the basic prompt
-        prompt = f"Give me a classic geocities-style webpage from the fictional site of '{url}' at the resource path of '{path}'. Make sure all links generated either link to an external website, or if they link to another resource on the current website have the current url prepended ({url}) to them. For example if a link on the page has the href of 'help' or '/help', it should be replaced with '{url}/help'."
+        prompt = f"Give me a classic geocities-style webpage from the fictional site of '{url}' at the resource path of '{path}'. Make sure all links generated either link to an external website, or if they link to another resource on the current website have the current url prepended ({url}) to them. For example if a link on the page has the href of 'help' or '/help', it should be replaced with '{url}/help'. All your links must use absolute paths, do not shorten anything. Make the page look nice and unique using internal CSS stylesheets, don't make the pages look boring or generic."
         # TODO: I wanna add all other pages to the prompt so the next pages generated resemble them, but since Llama 3 is only 8k context I hesitate to do so
 
         # Add other pages to the prompt if they exist
@@ -118,8 +118,12 @@ def get_page(self, url, path, query=None):
             max_tokens=self.max_tokens
         )
 
-        # Add the page to the database
+        # Get and format the page
         generated_page = generated_page_completion.choices[0].message.content
+        open("curpage.html", "w+").write(generated_page)
+        generated_page = self._format_page(generated_page)
+
+        # Add the page to the database
         if not url in self.internet_db: self.internet_db[url] = dict()
         self.internet_db[url][path] = self._format_page(generated_page)
@@ -136,7 +140,7 @@ def get_search(self, query):
             },
             {
                 "role": "user",
-                "content": f"Generate the search results page for a fictitious search engine where the search query is '{query}'. Please include at least 10 results to different fictitious websites that relate to the query. DO NOT link to any real websites, every link should lead to a fictitious website. Feel free to add a bit of CSS to make the page look nice. Each search result will link to its own unique website that has nothing to do with the search engine. Make sure each fictitious website has a unique and somewhat creative URL. Don't mention that the results are fictitious."
+                "content": f"Generate the search results page for a fictitious search engine where the search query is '{query}'. Please include at least 10 results to different fictitious websites that relate to the query. DO NOT link to any real websites, every link should lead to a fictitious website. Feel free to add a bit of CSS to make the page look nice. Each search result will link to its own unique website that has nothing to do with the search engine and is not a path or webpage on the search engine's site. Make sure each fictitious website has a unique and somewhat creative URL. Don't mention that the results are fictitious."
             }],
             model="llama3",
             temperature=self.temperature,
             max_tokens=self.max_tokens
         )

From 19f242ff3b7442bfd53d3a05b48e9cc1114ddb54 Mon Sep 17 00:00:00 2001
From: Zetaphor
Date: Thu, 25 Apr 2024 21:45:14 -0500
Subject: [PATCH 11/11] Fix units check on image width

---
 ReaperEngine.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/ReaperEngine.py b/ReaperEngine.py
index 8336f55..2baa198 100644
--- a/ReaperEngine.py
+++ b/ReaperEngine.py
@@ -1,4 +1,5 @@
 import os
 import json
+import re
 import requests
 import random
@@ -74,9 +75,13 @@ def _format_page(self, dirty_html):
             # Assign a random width between 100 and 300px if width is not present
             img["width"] = str(random.randint(100, 300))
         else:
-            # Ensure the width does not exceed the max width
-            if int(img["width"]) > int(os.getenv("MAX_IMAGE_WIDTH")):
-                img["width"] = os.getenv("MAX_IMAGE_WIDTH")
+            # Use a regular expression to pull the digits out of the width value (e.g. "300px" -> "300")
+            width = re.findall(r'\d+', img["width"])[0]
+            max_width = re.findall(r'\d+', os.getenv("MAX_IMAGE_WIDTH", "300"))[0] # Default keeps this from crashing if MAX_IMAGE_WIDTH isn't set
+
+            # Convert the extracted strings to integers and clamp the width
+            if int(width) > int(max_width):
+                img["width"] = max_width
 
         alt_text = img.get("alt", "")
         new_src = self.image_search(alt_text)