-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlinks.aro
More file actions
executable file
·120 lines (102 loc) · 5.34 KB
/
links.aro
File metadata and controls
executable file
·120 lines (102 loc) · 5.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
(* ============================================================
ARO Web Crawler - Link Extraction using ParseHtml Action
Uses the built-in ParseHtml action for proper HTML parsing.
============================================================ *)
(* ------------------------------------------------------------
   ExtractLinks handler: entry point of the link pipeline.
   Parses fetched HTML, pulls every anchor href, and fans out one
   NormalizeUrl event per discovered link. Pipeline order:
   ExtractLinks -> NormalizeUrl -> FilterUrl -> QueueUrl -> Observer.
   ------------------------------------------------------------ *)
(Extract Links: ExtractLinks Handler) {
(* Typed event extraction - validates against ExtractLinksEvent schema *)
Extract the <event-data: ExtractLinksEvent> from the <event>.
(* Built-in ParseHtml action collects href attributes from anchor tags.
   NOTE(review): assumes <event-data: html> holds the full page body -
   confirm against the producer of ExtractLinksEvent. *)
ParseHtml the <links: links> from the <event-data: html>.
(* Fan out in parallel: each raw href is normalized independently.
   Deduplication is deferred to QueueUrl's id-based repository store. *)
parallel for each <raw-url> in <links> {
Emit a <NormalizeUrl: event> with {
raw: <raw-url>,
source: <event-data: url>,
base: <event-data: base>
}.
}
Return an <OK: status> for the <extraction>.
}
(* ------------------------------------------------------------
   NormalizeUrl handler: canonicalizes one raw href into an
   absolute URL (fragment stripped, trailing slash removed) and
   forwards it to FilterUrl. Unhandled/special hrefs are dropped.
   CASE ORDER MATTERS: /^\/$/ must be tested before /^\// so a
   bare "/" resolves to the origin rather than "origin + /". *)
(Normalize URL: NormalizeUrl Handler) {
(* Extract from event data structure *)
Extract the <raw-url> from the <event: raw>.
Extract the <source-url> from the <event: source>.
Extract the <base-domain> from the <event: base>.
(* Determine URL type and normalize *)
match <raw-url> {
case /^https?:\/\// {
(* Already absolute URL - strip fragment and trailing slash *)
Split the <frag-parts> from the <raw-url> by /#/.
Extract the <no-fragment: first> from the <frag-parts>.
Split the <slash-parts> from the <no-fragment> by /\/+$/.
Extract the <clean-url: first> from the <slash-parts>.
Emit a <FilterUrl: event> with { url: <clean-url>, base: <base-domain> }.
}
case /^\/$/ {
(* Just "/" means root - resolve to origin (scheme+host).
   base-domain is split on "://" so it is assumed to carry a scheme,
   e.g. "https://example.com/docs" - TODO confirm at the call site. *)
Split the <root-scheme-parts> from the <base-domain> by /:\/\//.
Extract the <root-scheme: first> from the <root-scheme-parts>.
Extract the <root-hostpath: last> from the <root-scheme-parts>.
Split the <root-hostsegs> from the <root-hostpath> by /\//.
Extract the <root-host: first> from the <root-hostsegs>.
Create the <root-origin> with "${<root-scheme>}://${<root-host>}".
Emit a <FilterUrl: event> with { url: <root-origin>, base: <base-domain> }.
}
case /^\// {
(* Root-relative URL: prepend origin (scheme+host), strip fragment and trailing slash *)
Split the <scheme-parts> from the <base-domain> by /:\/\//.
Extract the <scheme: first> from the <scheme-parts>.
Extract the <hostpath: last> from the <scheme-parts>.
Split the <hostsegs> from the <hostpath> by /\//.
Extract the <host: first> from the <hostsegs>.
Create the <origin> with "${<scheme>}://${<host>}".
Create the <joined-url> with "${<origin>}${<raw-url>}".
Split the <frag-parts> from the <joined-url> by /#/.
Extract the <no-fragment: first> from the <frag-parts>.
Split the <slash-parts> from the <no-fragment> by /\/+$/.
Extract the <clean-url: first> from the <slash-parts>.
Emit a <FilterUrl: event> with { url: <clean-url>, base: <base-domain> }.
}
case /^(#|mailto:|javascript:|tel:|data:|\.)/ {
(* Skip fragments, special URLs, and relative parent paths.
   NOTE(review): the leading "\." also drops "./page.html"-style
   same-directory links, not only "../" - confirm this is intended. *)
}
case /[a-zA-Z]/ {
(* Relative URL (e.g., "getting-started.html") - resolve against base.
   NOTE(review): this joins against <base-domain>, not the directory of
   <source-url>, so deep relative links resolve to the site root. That
   deviates from RFC 3986 reference resolution - verify acceptable for
   this crawler. This case is also a broad catch-all; it only behaves
   as "relative URL" because earlier cases consumed absolute/special
   forms. *)
Split the <src-frag> from the <raw-url> by /#/.
Extract the <rel-nofrag: first> from the <src-frag>.
Create the <rel-joined> with "${<base-domain>}/${<rel-nofrag>}".
Split the <rel-slash> from the <rel-joined> by /\/+$/.
Extract the <rel-clean: first> from the <rel-slash>.
Emit a <FilterUrl: event> with { url: <rel-clean>, base: <base-domain> }.
}
}
Return an <OK: status> for the <normalization>.
}
(* ------------------------------------------------------------
   FilterUrl handler: same-domain gate. Only URLs containing the
   base domain are forwarded to QueueUrl; all others are silently
   dropped (the handler still returns OK either way). *)
(Filter URL: FilterUrl Handler) {
(* Extract from event data structure *)
Extract the <url> from the <event: url>.
Extract the <base-domain> from the <event: base>.
(* Conditional emit: the "when" guard suppresses the event for
   off-domain URLs.
   NOTE(review): substring containment is a loose test - a foreign URL
   that merely mentions the base domain (e.g. in a query string) would
   pass. A host-prefix comparison would be stricter; confirm intent. *)
Emit a <QueueUrl: event> with { url: <url>, base: <base-domain> } when <url> contains <base-domain>.
Return an <OK: status> for the <filter>.
}
(* ------------------------------------------------------------
   QueueUrl handler: deduplicating enqueue. Stores the URL under a
   hash-derived id in <crawled-repository>; the repository's id
   keying makes repeated stores of the same URL idempotent, and the
   Trigger Crawl observer fires only for entries that are new. *)
(Queue URL: QueueUrl Handler) {
(* Extract from event data structure *)
Extract the <url> from the <event: url>.
Extract the <base-domain> from the <event: base>.
(* Deterministic id: identical URLs always hash to the same id,
   which is what makes repository-level dedup possible. *)
Compute the <url-id: hash> from the <url>.
(* Store with id - repository deduplicates by id, observer only fires for new entries *)
Create the <crawl-request> with { id: <url-id>, url: <url>, base: <base-domain> }.
Store the <crawl-request> into the <crawled-repository>.
Return an <OK: status> for the <queue>.
}
(* ------------------------------------------------------------
   Trigger Crawl observer: fires when a new crawl-request lands in
   <crawled-repository> and emits the CrawlPage event that drives
   the next fetch. NOTE(review): relies on the observer firing only
   for newly inserted ids (not overwrites) - confirm against the
   repository's observer semantics, since re-fires would re-crawl. *)
(Trigger Crawl: crawled-repository Observer) {
(* React to new entries in the repository *)
Extract the <crawl-request> from the <event: newValue>.
Extract the <url> from the <crawl-request: url>.
Extract the <base-domain> from the <crawl-request: base>.
(* Debug-only trace, gated on the DEBUG environment variable *)
Log "Queued: ${<url>}" to the <console> when <env: DEBUG> == "1".
Emit a <CrawlPage: event> with { url: <url>, base: <base-domain> }.
Return an <OK: status> for the <observer>.
}