diff --git a/README.md b/README.md index f32e1d2..a4904e7 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,12 @@ HTML-filer kan publiceras via: - **Cloudflare R2**: Med `html-export-workflow.yml` (kräver R2-credentials) - **GitHub Pages**: Med `github-pages-workflow.yml` (enklare setup, kräver aktiverad GitHub Pages) +För att testa HTML-sajten lokalt: +```bash +python scripts/serve_html.py +``` +Detta startar en lokal HTTP-server på `http://localhost:8000`. Du kan ange en annan port med `python scripts/serve_html.py 3000`. + ### Vektor-format (för semantisk sökning) - **`vector`**: Konverterar författningar till vektorembeddings för semantisk sökning och RAG-applikationer. Använder OpenAI:s text-embedding-3-large modell (3072 dimensioner) och stödjer lagring i PostgreSQL (pgvector), Elasticsearch eller JSON-fil. diff --git a/README_EN.md b/README_EN.md index 09c7ab7..8b31635 100644 --- a/README_EN.md +++ b/README_EN.md @@ -50,6 +50,12 @@ HTML files can be published via: - **Cloudflare R2**: Using `html-export-workflow.yml` (requires R2 credentials) - **GitHub Pages**: Using `github-pages-workflow.yml` (simpler setup, requires GitHub Pages enabled) +To test the HTML site locally: +```bash +python scripts/serve_html.py +``` +This starts a local HTTP server on `http://localhost:8000`. You can specify a different port with `python scripts/serve_html.py 3000`. + ### Vector Format (for semantic search) - **`vector`**: Converts legislation to vector embeddings for semantic search and RAG applications. Uses OpenAI's text-embedding-3-large model (3072 dimensions) and supports storage in PostgreSQL (pgvector), Elasticsearch, or JSON file. 
def make_links_relative(html_content: str) -> str:
    """
    Strip the internal-links base URL from href attributes.

    The base URL is read from the ``INTERNAL_LINKS_BASE_URL`` environment
    variable (default ``https://selex.se/eli``). Every ``href`` that starts
    with that base followed by a slash is rewritten to the root-relative
    remainder, e.g. ``href="https://selex.se/eli/sfs/2002/43"`` becomes
    ``href="/sfs/2002/43"``. Links to other hosts, and a link that is exactly
    the base URL with no path, are left untouched.

    NOTE(review): the previous docstring said this "works with <element> to
    resolve paths correctly", but the element name was lost; it presumably
    referred to a ``<base>`` element in the generated page head - confirm.

    Args:
        html_content (str): HTML content with potentially absolute links

    Returns:
        str: HTML content with the base URL removed from matching links
    """
    # Normalise a configured trailing slash so that both
    # "https://host/eli" and "https://host/eli/" behave the same.
    base_url = os.getenv('INTERNAL_LINKS_BASE_URL', 'https://selex.se/eli').rstrip('/')

    if not base_url:
        # Nothing to strip; without a base the links are already relative.
        return html_content

    # The base URL is escaped because it contains regex metacharacters
    # (the dots in the host name). "/+" swallows an accidental doubled
    # slash so the result can never become a protocol-relative "//path".
    pattern = rf'href="{re.escape(base_url)}/+([^"]*)"'
    replacement = r'href="/\1"'

    return re.sub(pattern, replacement, html_content)
sfs_pattern = SFS_PATTERN - # Använd alltid https://selex.se/eli som default om env variabel inte finns + # Använd base URL med /eli/ prefix (default: https://selex.se/eli) base_url = os.getenv('INTERNAL_LINKS_BASE_URL', 'https://selex.se/eli') # TODO: Slå upp SFS-beteckning mot JSON-fil för att verifiera giltighet @@ -238,9 +238,9 @@ def apply_law_name_links(text: str) -> str: if not law_names_data: return text - # Hämta bas-URL från miljövariabler - använd alltid https://selex.se/eli som default + # Hämta bas-URL från miljövariabler (med /eli/ prefix) base_url = os.getenv('INTERNAL_LINKS_BASE_URL', 'https://selex.se/eli') - + # Processar texten rad för rad för att undvika att länka rubriker lines = text.split('\n') processed_lines = [] @@ -257,22 +257,22 @@ def replace_law_name_reference(match): paragraph_part = match.group(2).strip() law_name = match.group(3).lower() full_match = match.group(0) - + # Leta upp lagnamnet i data sfs_id = _lookup_law_name(law_name, law_names_data) - + if not sfs_id: print(f"Varning: Okänt lagnamn '{law_name}' i referens '{full_match}'") return full_match # Returnera oförändrat om lagnamnet inte hittas - + # Extrahera år och nummer från SFS-ID (format: "YYYY:NNN") id_parts = sfs_id.split(':') if len(id_parts) != 2: print(f"Varning: Ogiltigt SFS-ID format '{sfs_id}' för lagnamn '{law_name}'") return full_match - + year, number = id_parts - + # Skapa bas-URL url = f"{base_url}/sfs/{year}/{number}" diff --git a/scripts/serve_html.py b/scripts/serve_html.py new file mode 100755 index 0000000..4245de9 --- /dev/null +++ b/scripts/serve_html.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +""" +Simple HTTP server to serve the HTML site locally for testing. +Run with: python serve_html.py [port] +Default port is 8000. 
import functools
import http.server
import os
import socketserver
import sys
from pathlib import Path

# Port used when none is given on the command line.
DEFAULT_PORT = 8000


class _ReusableTCPServer(socketserver.TCPServer):
    """TCPServer that may rebind its port right after a restart."""

    # Without this, stopping and immediately restarting the script fails
    # with "Address already in use" until the old socket leaves TIME_WAIT.
    allow_reuse_address = True


def _parse_port(args):
    """Return the port named by args[0], DEFAULT_PORT if absent, None if invalid."""
    if not args:
        return DEFAULT_PORT
    try:
        port = int(args[0])
    except ValueError:
        return None
    # int() accepts values such as 70000 or -1 that cannot be bound.
    return port if 0 <= port <= 65535 else None


def main(argv=None):
    """Serve the exported HTML site over HTTP until interrupted.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). Defaults to ``sys.argv[1:]``. The first item, if any, is
            the TCP port; the default port is 8000.

    Exits with status 1 when the port is not an integer in 0-65535, when
    the site directory ``<project root>/output/html_site`` does not exist,
    or when the server socket cannot be bound.
    """
    args = sys.argv[1:] if argv is None else list(argv)

    port = _parse_port(args)
    if port is None:
        print(f"Error: Invalid port number '{args[0]}'")
        sys.exit(1)

    # resolve() first: before Python 3.9 __file__ may be a bare relative
    # name, in which case .parent.parent would not reach the project root.
    html_dir = Path(__file__).resolve().parent.parent / "output" / "html_site"

    if not html_dir.is_dir():
        print(f"Error: HTML site directory not found at {html_dir}")
        print("Run the HTML export first with:")
        print("  python sfs_processor.py --formats html")
        sys.exit(1)

    # Point the handler at the site directory instead of os.chdir(), so the
    # process-wide working directory is left alone.
    handler = functools.partial(
        http.server.SimpleHTTPRequestHandler,
        directory=os.fspath(html_dir),
    )

    try:
        httpd = _ReusableTCPServer(("", port), handler)
    except OSError as exc:
        print(f"Error: Could not start server on port {port}: {exc}")
        sys.exit(1)

    with httpd:
        # Report the port actually bound (differs from `port` when it is 0).
        bound_port = httpd.server_address[1]
        print(f"Serving HTML site at http://localhost:{bound_port}")
        print(f"Directory: {html_dir}")
        print("Press Ctrl+C to stop")

        try:
            httpd.serve_forever()
        except KeyboardInterrupt:
            print("\nShutting down server...")


if __name__ == "__main__":
    main()