|
5 | 5 | ============================================================ *) |
6 | 6 |
|
7 | 7 | (Extract Links: ExtractLinks Handler) { |
8 | | - <Log> "ExtractLinks handler triggered" to the <console>. |
9 | | - |
10 | 8 | (* Extract from event data structure *) |
11 | 9 | <Extract> the <event-data> from the <event: data>. |
12 | 10 | <Extract> the <html> from the <event-data: html>. |
|
15 | 13 |
|
16 | 14 | (* Use ParseHtml action to extract all href attributes from anchor tags *) |
17 | 15 | <ParseHtml> the <links: links> from the <html>. |
18 | | - <Compute> the <link-count: count> from the <links>. |
19 | | - <Log> "Found ${<link-count>} links" to the <console>. |
20 | 16 |
|
21 | | - (* Process each extracted link using parallel for *) |
| 17 | + (* Process links in parallel - repository Actor ensures atomic dedup *) |
22 | 18 | parallel for each <raw-url> in <links> { |
23 | 19 | <Emit> a <NormalizeUrl: event> with { |
24 | 20 | raw: <raw-url>, |
|
40 | 36 | (* Determine URL type and normalize *) |
41 | 37 | match <raw-url> { |
42 | 38 | case /^https?:\/\// { |
43 | | - (* Already absolute URL *) |
44 | | - <Emit> a <FilterUrl: event> with { url: <raw-url>, base: <base-domain> }. |
| 39 | + (* Already absolute URL - strip fragment and trailing slash *) |
| 40 | + <Split> the <frag-parts> from the <raw-url> by /#/. |
| 41 | + <Extract> the <no-fragment: first> from the <frag-parts>. |
| 42 | + <Split> the <slash-parts> from the <no-fragment> by /\/+$/. |
| 43 | + <Extract> the <clean-url: first> from the <slash-parts>. |
| 44 | + <Emit> a <FilterUrl: event> with { url: <clean-url>, base: <base-domain> }. |
45 | 45 | } |
46 | 46 | case /^\/$/ { |
47 | 47 | (* Just "/" means root - use base domain as-is (no trailing slash) *) |
48 | 48 | <Emit> a <FilterUrl: event> with { url: <base-domain>, base: <base-domain> }. |
49 | 49 | } |
50 | 50 | case /^\// { |
51 | | - (* Root-relative URL: prepend base domain *) |
52 | | - <Create> the <absolute-url> with "${<base-domain>}${<raw-url>}". |
53 | | - <Emit> a <FilterUrl: event> with { url: <absolute-url>, base: <base-domain> }. |
| 51 | + (* Root-relative URL: prepend base domain, strip fragment and trailing slash *) |
| 52 | + <Create> the <joined-url> with "${<base-domain>}${<raw-url>}". |
| 53 | + <Split> the <frag-parts> from the <joined-url> by /#/. |
| 54 | + <Extract> the <no-fragment: first> from the <frag-parts>. |
| 55 | + <Split> the <slash-parts> from the <no-fragment> by /\/+$/. |
| 56 | + <Extract> the <clean-url: first> from the <slash-parts>. |
| 57 | + <Emit> a <FilterUrl: event> with { url: <clean-url>, base: <base-domain> }. |
54 | 58 | } |
55 | 59 | case /^(#|mailto:|javascript:|tel:|data:)/ { |
56 | 60 | (* Skip fragments and special URLs *) |
|
67 | 71 | <Extract> the <base-domain> from the <event-data: base>. |
68 | 72 |
|
69 | 73 | (* Filter URLs that belong to the same domain as base-domain *) |
70 | | - <Log> "Queuing: ${<url>}" to the <console> when <url> contains <base-domain>. |
71 | 74 | <Emit> a <QueueUrl: event> with { url: <url>, base: <base-domain> } when <url> contains <base-domain>. |
72 | 75 |
|
73 | 76 | <Return> an <OK: status> for the <filter>. |
|
79 | 82 | <Extract> the <url> from the <event-data: url>. |
80 | 83 | <Extract> the <base-domain> from the <event-data: base>. |
81 | 84 |
|
82 | | - (* Check if already crawled *) |
83 | | - <Retrieve> the <crawled-urls> from the <crawled-repository>. |
84 | | - <Create> the <single-url-list> with [<url>]. |
85 | | - <Compute> the <uncrawled-urls: difference> from <single-url-list> with <crawled-urls>. |
86 | | - <Compute> the <uncrawled-count: count> from <uncrawled-urls>. |
| 85 | + (* Atomic store - the repository Actor serializes concurrent access, |
| 86 | + so only the first caller for a given URL gets <new-entry> = 1 *)
| 87 | + <Store> the <url> into the <crawled-repository>. |
87 | 88 |
|
88 | | - (* Only queue if not already crawled *) |
89 | | - <Log> "Queued: ${<url>}" to the <console> when <uncrawled-count> > 0. |
90 | | - <Emit> a <CrawlPage: event> with { url: <url>, base: <base-domain> } when <uncrawled-count> > 0. |
| 89 | + (* Only emit CrawlPage if this URL was newly stored *) |
| 90 | + <Log> "Queued: ${<url>}" to the <console> when <new-entry> > 0. |
| 91 | + <Emit> a <CrawlPage: event> with { url: <url>, base: <base-domain> } when <new-entry> > 0. |
91 | 92 |
|
92 | 93 | <Return> an <OK: status> for the <queue>. |
93 | 94 | } |
0 commit comments