-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathlinks.aro
More file actions
executable file
·120 lines (102 loc) · 5.34 KB
/
links.aro
File metadata and controls
executable file
·120 lines (102 loc) · 5.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
(* ============================================================
ARO Web Crawler - Link Extraction using ParseHtml Action
Uses the built-in ParseHtml action for proper HTML parsing.
============================================================ *)
(* ------------------------------------------------------------
   ExtractLinks handler: entry point of the link pipeline.
   Parses fetched HTML, pulls every anchor href, and fans out one
   NormalizeUrl event per discovered link. Pipeline order:
   ExtractLinks -> NormalizeUrl -> FilterUrl -> QueueUrl -> Observer.
   ------------------------------------------------------------ *)
(Extract Links: ExtractLinks Handler) {
(* Typed event extraction - validates against ExtractLinksEvent schema *)
Extract the <event-data: ExtractLinksEvent> from the <event>.
(* Built-in ParseHtml action collects href attributes from anchor tags.
   NOTE(review): assumes <event-data: html> holds the full page body -
   confirm against the producer of ExtractLinksEvent. *)
ParseHtml the <links: links> from the <event-data: html>.
(* Fan out in parallel: each raw href is normalized independently.
   Deduplication is deferred to QueueUrl's id-based repository store. *)
parallel for each <raw-url> in <links> {
Emit a <NormalizeUrl: event> with {
raw: <raw-url>,
source: <event-data: url>,
base: <event-data: base>
}.
}
Return an <OK: status> for the <extraction>.
}
(* ------------------------------------------------------------
   NormalizeUrl handler: canonicalizes one raw href into an
   absolute URL (fragment stripped, trailing slash removed) and
   forwards it to FilterUrl. Unhandled/special hrefs are dropped.
   CASE ORDER MATTERS: /^\/$/ must be tested before /^\// so a
   bare "/" resolves to the origin rather than "origin + /". *)
(Normalize URL: NormalizeUrl Handler) {
(* Extract from event data structure *)
Extract the <raw-url> from the <event: raw>.
Extract the <source-url> from the <event: source>.
Extract the <base-domain> from the <event: base>.
(* Determine URL type and normalize *)
match <raw-url> {
case /^https?:\/\// {
(* Already absolute URL - strip fragment and trailing slash *)
Split the <frag-parts> from the <raw-url> by /#/.
Extract the <no-fragment: first> from the <frag-parts>.
Split the <slash-parts> from the <no-fragment> by /\/+$/.
Extract the <clean-url: first> from the <slash-parts>.
Emit a <FilterUrl: event> with { url: <clean-url>, base: <base-domain> }.
}
case /^\/$/ {
(* Just "/" means root - resolve to origin (scheme+host).
   base-domain is split on "://" so it is assumed to carry a scheme,
   e.g. "https://example.com/docs" - TODO confirm at the call site. *)
Split the <root-scheme-parts> from the <base-domain> by /:\/\//.
Extract the <root-scheme: first> from the <root-scheme-parts>.
Extract the <root-hostpath: last> from the <root-scheme-parts>.
Split the <root-hostsegs> from the <root-hostpath> by /\//.
Extract the <root-host: first> from the <root-hostsegs>.
Create the <root-origin> with "${<root-scheme>}://${<root-host>}".
Emit a <FilterUrl: event> with { url: <root-origin>, base: <base-domain> }.
}
case /^\// {
(* Root-relative URL: prepend origin (scheme+host), strip fragment and trailing slash *)
Split the <scheme-parts> from the <base-domain> by /:\/\//.
Extract the <scheme: first> from the <scheme-parts>.
Extract the <hostpath: last> from the <scheme-parts>.
Split the <hostsegs> from the <hostpath> by /\//.
Extract the <host: first> from the <hostsegs>.
Create the <origin> with "${<scheme>}://${<host>}".
Create the <joined-url> with "${<origin>}${<raw-url>}".
Split the <frag-parts> from the <joined-url> by /#/.
Extract the <no-fragment: first> from the <frag-parts>.
Split the <slash-parts> from the <no-fragment> by /\/+$/.
Extract the <clean-url: first> from the <slash-parts>.
Emit a <FilterUrl: event> with { url: <clean-url>, base: <base-domain> }.
}
case /^(#|mailto:|javascript:|tel:|data:|\.)/ {
(* Skip fragments, special URLs, and relative parent paths.
   NOTE(review): the leading "\." also drops "./page.html"-style
   same-directory links, not only "../" - confirm this is intended. *)
}
case /[a-zA-Z]/ {
(* Relative URL (e.g., "getting-started.html") - resolve against base.
   NOTE(review): this joins against <base-domain>, not the directory of
   <source-url>, so deep relative links resolve to the site root. That
   deviates from RFC 3986 reference resolution - verify acceptable for
   this crawler. This case is also a broad catch-all; it only behaves
   as "relative URL" because earlier cases consumed absolute/special
   forms. *)
Split the <src-frag> from the <raw-url> by /#/.
Extract the <rel-nofrag: first> from the <src-frag>.
Create the <rel-joined> with "${<base-domain>}/${<rel-nofrag>}".
Split the <rel-slash> from the <rel-joined> by /\/+$/.
Extract the <rel-clean: first> from the <rel-slash>.
Emit a <FilterUrl: event> with { url: <rel-clean>, base: <base-domain> }.
}
}
Return an <OK: status> for the <normalization>.
}
(* ------------------------------------------------------------
   FilterUrl handler: same-domain gate. Only URLs containing the
   base domain are forwarded to QueueUrl; all others are silently
   dropped (the handler still returns OK either way). *)
(Filter URL: FilterUrl Handler) {
(* Extract from event data structure *)
Extract the <url> from the <event: url>.
Extract the <base-domain> from the <event: base>.
(* Conditional emit: the "when" guard suppresses the event for
   off-domain URLs.
   NOTE(review): substring containment is a loose test - a foreign URL
   that merely mentions the base domain (e.g. in a query string) would
   pass. A host-prefix comparison would be stricter; confirm intent. *)
Emit a <QueueUrl: event> with { url: <url>, base: <base-domain> } when <url> contains <base-domain>.
Return an <OK: status> for the <filter>.
}
(* ------------------------------------------------------------
   QueueUrl handler: deduplicating enqueue. Stores the URL under a
   hash-derived id in <crawled-repository>; the repository's id
   keying makes repeated stores of the same URL idempotent, and the
   Trigger Crawl observer fires only for entries that are new. *)
(Queue URL: QueueUrl Handler) {
(* Extract from event data structure *)
Extract the <url> from the <event: url>.
Extract the <base-domain> from the <event: base>.
(* Deterministic id: identical URLs always hash to the same id,
   which is what makes repository-level dedup possible. *)
Compute the <url-id: hash> from the <url>.
(* Store with id - repository deduplicates by id, observer only fires for new entries *)
Create the <crawl-request> with { id: <url-id>, url: <url>, base: <base-domain> }.
Store the <crawl-request> into the <crawled-repository>.
Return an <OK: status> for the <queue>.
}
(* ------------------------------------------------------------
   Trigger Crawl observer: fires when a new crawl-request lands in
   <crawled-repository> and emits the CrawlPage event that drives
   the next fetch. NOTE(review): relies on the observer firing only
   for newly inserted ids (not overwrites) - confirm against the
   repository's observer semantics, since re-fires would re-crawl. *)
(Trigger Crawl: crawled-repository Observer) {
(* React to new entries in the repository *)
Extract the <crawl-request> from the <event: newValue>.
Extract the <url> from the <crawl-request: url>.
Extract the <base-domain> from the <crawl-request: base>.
(* Debug-only trace, gated on the DEBUG environment variable *)
Log "Queued: ${<url>}" to the <console> when <env: DEBUG> == "1".
Emit a <CrawlPage: event> with { url: <url>, base: <base-domain> }.
Return an <OK: status> for the <observer>.
}