Consolidate log statements via string interpolation; simplify link and URL filtering

KrisSimon · KrisSimon · commit 056c96983c92 · 2026-01-25T13:19:50.000+01:00
diff --git a/crawler.aro b/crawler.aro
@@ -13,8 +13,7 @@
     <Extract> the <url> from the <event-data: url>.
     <Extract> the <base-domain> from the <event-data: base>.
 
-    <Log> "Extracted URL:" to the <console>.
-    <Log> <url> to the <console>.
+    <Log> "Extracted URL: ${<url>}" to the <console>.
 
     (* Check if already crawled *)
     <Retrieve> the <crawled-urls> from the <crawled-repository>.
@@ -23,25 +22,22 @@
     <Compute> the <new-url-count: count> from <new-urls>.
 
     (* Skip if already crawled - use match to check count *)
-    <Log> "Checking URL..." to the <console>.
     match <new-url-count> {
         case 0 {
             <Return> an <OK: status> for the <skip>.
         }
     }
 
-    <Log> "Crawling:" to the <console>.
-    <Log> <url> to the <console>.
+    <Log> "Crawling: ${<url>}" to the <console>.
 
     (* Mark URL as crawled before fetching to prevent duplicate requests *)
     <Compute> the <updated-crawled: union> from <crawled-urls> with <single-url-list>.
     <Store> the <updated-crawled> into the <crawled-repository>.
 
     (* Fetch the page *)
     <Request> the <html> from the <url>.
-    <Log> "Fetched HTML length:" to the <console>.
     <Compute> the <html-len: length> from the <html>.
-    <Log> <html-len> to the <console>.
+    <Log> "Fetched HTML length: ${<html-len>}" to the <console>.
 
     (* Extract markdown content from HTML using ParseHtml action *)
     <ParseHtml> the <markdown-result: markdown> from the <html>.
diff --git a/links.aro b/links.aro
@@ -16,9 +16,7 @@
     (* Use ParseHtml action to extract all href attributes from anchor tags *)
     <ParseHtml> the <links: links> from the <html>.
     <Compute> the <link-count: count> from the <links>.
-    <Log> "Found" to the <console>.
-    <Log> <link-count> to the <console>.
-    <Log> "links" to the <console>.
+    <Log> "Found ${<link-count>} links" to the <console>.
 
     (* Process each extracted link using parallel for *)
     parallel for each <raw-url> in <links> {
@@ -54,20 +52,8 @@
             <Create> the <absolute-url> with "${<base-domain>}${<raw-url>}".
             <Emit> a <FilterUrl: event> with { url: <absolute-url>, base: <base-domain> }.
         }
-        case /^#/ {
-            (* Fragment only - skip *)
-        }
-        case /^mailto:/ {
-            (* Email link - skip *)
-        }
-        case /^javascript:/ {
-            (* JavaScript link - skip *)
-        }
-        case /^tel:/ {
-            (* Phone link - skip *)
-        }
-        case /^data:/ {
-            (* Data URL - skip *)
+        case /^(#|mailto:|javascript:|tel:|data:)/ {
+            (* Skip fragments and special URLs *)
         }
     }
 
@@ -80,16 +66,9 @@
     <Extract> the <url> from the <event-data: url>.
     <Extract> the <base-domain> from the <event-data: base>.
 
-    (* Simple check: URL must start with http:// or https:// followed by same domain *)
-    (* For now, just pass all URLs through since they've been normalized *)
-    (* The QueueUrl handler will filter duplicates *)
-    match <url> {
-        case /^https?:\/\/ausdertechnik\.de/ {
-            <Log> "Queuing:" to the <console>.
-            <Log> <url> to the <console>.
-            <Emit> a <QueueUrl: event> with { url: <url>, base: <base-domain> }.
-        }
-    }
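+    (* Queue only URLs that contain base-domain. NOTE(review): "contains" appears to be a substring test, not an anchored host match like the removed regex - confirm *)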
+    <Log> "Queuing: ${<url>}" to the <console> when <url> contains <base-domain>.
+    <Emit> a <QueueUrl: event> with { url: <url>, base: <base-domain> } when <url> contains <base-domain>.
 
     <Return> an <OK: status> for the <filter>.
 }
@@ -107,8 +86,7 @@
     <Compute> the <uncrawled-count: count> from <uncrawled-urls>.
 
     (* Only queue if not already crawled *)
-    <Log> "Queued:" to the <console> when <uncrawled-count> > 0.
-    <Log> <url> to the <console> when <uncrawled-count> > 0.
+    <Log> "Queued: ${<url>}" to the <console> when <uncrawled-count> > 0.
     <Emit> a <CrawlPage: event> with { url: <url>, base: <base-domain> } when <uncrawled-count> > 0.
 
     <Return> an <OK: status> for the <queue>.
diff --git a/main.aro b/main.aro
@@ -10,8 +10,7 @@
     (* Read starting URL from environment *)
     <Extract> the <start-url> from the <env: CRAWL_URL>.
 
-    <Log> "Starting URL:" to the <console>.
-    <Log> <start-url> to the <console>.
+    <Log> "Starting URL: ${<start-url>}" to the <console>.
 
     (* Create output directory *)
     <Create> the <output-path> with "./output".
diff --git a/storage.aro b/storage.aro
@@ -16,9 +16,7 @@
     <Compute> the <url-hash: hash> from the <url>.
     <Create> the <file-path> with "./output/${<url-hash>}.md".
 
-    <Log> "Saving:" to the <console>.
-    <Log> <url> to the <console>.
-    <Log> <file-path> to the <console>.
+    <Log> "Saving: ${<url>} to ${<file-path>}" to the <console>.
 
     (* Format markdown file with frontmatter *)
     <Create> the <file-content> with "# ${<title>}\n\n**Source:** ${<url>}\n\n---\n\n${<content>}".