Skip to content

Commit 0ddd3b4

Browse files
committed
Simplify crawler with atomic Store dedup and remove Keepalive
- Remove dedup logic from crawler.aro (no more Retrieve/difference/union)
- Use atomic <Store> with new-entry binding in QueueUrl for race-safe dedup
- Remove Keepalive from main.aro (Emit blocks until chain completes)
- Emit QueueUrl instead of CrawlPage from Application-Start
- Add fragment stripping and trailing slash removal in NormalizeUrl
- Remove debug logging from ExtractLinks and FilterUrl handlers
1 parent e01ed6c commit 0ddd3b4

File tree

3 files changed

+22
-51
lines changed

3 files changed

+22
-51
lines changed

crawler.aro

Lines changed: 0 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,38 +6,15 @@
66
============================================================ *)
77

88
(Crawl Page: CrawlPage Handler) {
9-
<Log> "CrawlPage handler triggered" to the <console>.
10-
119
(* Extract from event data *)
1210
<Extract> the <event-data> from the <event: data>.
1311
<Extract> the <url> from the <event-data: url>.
1412
<Extract> the <base-domain> from the <event-data: base>.
1513

16-
<Log> "Extracted URL: ${<url>}" to the <console>.
17-
18-
(* Check if already crawled *)
19-
<Retrieve> the <crawled-urls> from the <crawled-repository>.
20-
<Create> the <single-url-list> with [<url>].
21-
<Compute> the <new-urls: difference> from <single-url-list> with <crawled-urls>.
22-
<Compute> the <new-url-count: count> from <new-urls>.
23-
24-
(* Skip if already crawled - use match to check count *)
25-
match <new-url-count> {
26-
case 0 {
27-
<Return> an <OK: status> for the <skip>.
28-
}
29-
}
30-
3114
<Log> "Crawling: ${<url>}" to the <console>.
3215

33-
(* Mark URL as crawled before fetching to prevent duplicate requests *)
34-
<Compute> the <updated-crawled: union> from <crawled-urls> with <single-url-list>.
35-
<Store> the <updated-crawled> into the <crawled-repository>.
36-
3716
(* Fetch the page *)
3817
<Request> the <html> from the <url>.
39-
<Compute> the <html-len: length> from the <html>.
40-
<Log> "Fetched HTML length: ${<html-len>}" to the <console>.
4118

4219
(* Extract markdown content from HTML using ParseHtml action *)
4320
<ParseHtml> the <markdown-result: markdown> from the <html>.

links.aro

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,6 @@
55
============================================================ *)
66

77
(Extract Links: ExtractLinks Handler) {
8-
<Log> "ExtractLinks handler triggered" to the <console>.
9-
108
(* Extract from event data structure *)
119
<Extract> the <event-data> from the <event: data>.
1210
<Extract> the <html> from the <event-data: html>.
@@ -15,10 +13,8 @@
1513

1614
(* Use ParseHtml action to extract all href attributes from anchor tags *)
1715
<ParseHtml> the <links: links> from the <html>.
18-
<Compute> the <link-count: count> from the <links>.
19-
<Log> "Found ${<link-count>} links" to the <console>.
2016

21-
(* Process each extracted link using parallel for *)
17+
(* Process links in parallel - repository Actor ensures atomic dedup *)
2218
parallel for each <raw-url> in <links> {
2319
<Emit> a <NormalizeUrl: event> with {
2420
raw: <raw-url>,
@@ -40,17 +36,25 @@
4036
(* Determine URL type and normalize *)
4137
match <raw-url> {
4238
case /^https?:\/\// {
43-
(* Already absolute URL *)
44-
<Emit> a <FilterUrl: event> with { url: <raw-url>, base: <base-domain> }.
39+
(* Already absolute URL - strip fragment and trailing slash *)
40+
<Split> the <frag-parts> from the <raw-url> by /#/.
41+
<Extract> the <no-fragment: first> from the <frag-parts>.
42+
<Split> the <slash-parts> from the <no-fragment> by /\/+$/.
43+
<Extract> the <clean-url: first> from the <slash-parts>.
44+
<Emit> a <FilterUrl: event> with { url: <clean-url>, base: <base-domain> }.
4545
}
4646
case /^\/$/ {
4747
(* Just "/" means root - use base domain as-is (no trailing slash) *)
4848
<Emit> a <FilterUrl: event> with { url: <base-domain>, base: <base-domain> }.
4949
}
5050
case /^\// {
51-
(* Root-relative URL: prepend base domain *)
52-
<Create> the <absolute-url> with "${<base-domain>}${<raw-url>}".
53-
<Emit> a <FilterUrl: event> with { url: <absolute-url>, base: <base-domain> }.
51+
(* Root-relative URL: prepend base domain, strip fragment and trailing slash *)
52+
<Create> the <joined-url> with "${<base-domain>}${<raw-url>}".
53+
<Split> the <frag-parts> from the <joined-url> by /#/.
54+
<Extract> the <no-fragment: first> from the <frag-parts>.
55+
<Split> the <slash-parts> from the <no-fragment> by /\/+$/.
56+
<Extract> the <clean-url: first> from the <slash-parts>.
57+
<Emit> a <FilterUrl: event> with { url: <clean-url>, base: <base-domain> }.
5458
}
5559
case /^(#|mailto:|javascript:|tel:|data:)/ {
5660
(* Skip fragments and special URLs *)
@@ -67,7 +71,6 @@
6771
<Extract> the <base-domain> from the <event-data: base>.
6872

6973
(* Filter URLs that belong to the same domain as base-domain *)
70-
<Log> "Queuing: ${<url>}" to the <console> when <url> contains <base-domain>.
7174
<Emit> a <QueueUrl: event> with { url: <url>, base: <base-domain> } when <url> contains <base-domain>.
7275

7376
<Return> an <OK: status> for the <filter>.
@@ -79,15 +82,13 @@
7982
<Extract> the <url> from the <event-data: url>.
8083
<Extract> the <base-domain> from the <event-data: base>.
8184

82-
(* Check if already crawled *)
83-
<Retrieve> the <crawled-urls> from the <crawled-repository>.
84-
<Create> the <single-url-list> with [<url>].
85-
<Compute> the <uncrawled-urls: difference> from <single-url-list> with <crawled-urls>.
86-
<Compute> the <uncrawled-count: count> from <uncrawled-urls>.
85+
(* Atomic store - the repository Actor serializes concurrent access,
86+
so only the first caller for a given URL gets is-new-entry = 1 *)
87+
<Store> the <url> into the <crawled-repository>.
8788

88-
(* Only queue if not already crawled *)
89-
<Log> "Queued: ${<url>}" to the <console> when <uncrawled-count> > 0.
90-
<Emit> a <CrawlPage: event> with { url: <url>, base: <base-domain> } when <uncrawled-count> > 0.
89+
(* Only emit CrawlPage if this URL was newly stored *)
90+
<Log> "Queued: ${<url>}" to the <console> when <new-entry> > 0.
91+
<Emit> a <CrawlPage: event> with { url: <url>, base: <base-domain> } when <new-entry> > 0.
9192

9293
<Return> an <OK: status> for the <queue>.
9394
}

main.aro

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,8 @@
1717
<Make> the <output-dir> to the <directory: output-path>.
1818
<Log> "Output directory created" to the <console>.
1919

20-
(* Initialize empty crawled URLs set *)
21-
<Create> the <crawled-urls> with [].
22-
<Store> the <crawled-urls> into the <crawled-repository>.
23-
24-
(* Start crawling *)
25-
<Emit> a <CrawlPage: event> with { url: <start-url>, base: <start-url> }.
26-
27-
(* Keep application alive to process events *)
28-
<Keepalive> the <application> for the <events>.
20+
(* Queue initial URL - Emit blocks until the entire crawl chain completes *)
21+
<Emit> a <QueueUrl: event> with { url: <start-url>, base: <start-url> }.
2922

3023
<Return> an <OK: status> for the <startup>.
3124
}

0 commit comments

Comments
 (0)