|
5 | 5 | ============================================================ *) |
6 | 6 |
|
7 | 7 | (Extract Links: ExtractLinks Handler) { |
8 | | - <Log> "ExtractLinks handler triggered" to the <console>. |
9 | | - |
10 | 8 | (* Extract from event data structure *) |
11 | 9 | <Extract> the <event-data> from the <event: data>. |
12 | 10 | <Extract> the <html> from the <event-data: html>. |
|
15 | 13 |
|
16 | 14 | (* Use ParseHtml action to extract all href attributes from anchor tags *) |
17 | 15 | <ParseHtml> the <links: links> from the <html>. |
18 | | - <Compute> the <link-count: count> from the <links>. |
19 | | - <Log> "Found ${<link-count>} links" to the <console>. |
20 | 16 |
|
21 | | - (* Process each extracted link using parallel for *) |
| 17 | + (* Process links in parallel - repository Actor ensures atomic dedup *) |
22 | 18 | parallel for each <raw-url> in <links> { |
23 | 19 | <Emit> a <NormalizeUrl: event> with { |
24 | 20 | raw: <raw-url>, |
|
40 | 36 | (* Determine URL type and normalize *) |
41 | 37 | match <raw-url> { |
42 | 38 | case /^https?:\/\// { |
43 | | - (* Already absolute URL *) |
44 | | - <Emit> a <FilterUrl: event> with { url: <raw-url>, base: <base-domain> }. |
| 39 | + (* Already absolute URL - strip fragment and trailing slash *) |
| 40 | + <Split> the <frag-parts> from the <raw-url> by /#/. |
| 41 | + <Extract> the <no-fragment: first> from the <frag-parts>. |
| 42 | + <Split> the <slash-parts> from the <no-fragment> by /\/+$/. |
| 43 | + <Extract> the <clean-url: first> from the <slash-parts>. |
| 44 | + <Emit> a <FilterUrl: event> with { url: <clean-url>, base: <base-domain> }. |
45 | 45 | } |
46 | 46 | case /^\/$/ { |
47 | 47 | (* Just "/" means root - use base domain as-is (no trailing slash) *) |
48 | 48 | <Emit> a <FilterUrl: event> with { url: <base-domain>, base: <base-domain> }. |
49 | 49 | } |
50 | 50 | case /^\// { |
51 | | - (* Root-relative URL: prepend base domain *) |
52 | | - <Create> the <absolute-url> with "${<base-domain>}${<raw-url>}". |
53 | | - <Emit> a <FilterUrl: event> with { url: <absolute-url>, base: <base-domain> }. |
| 51 | + (* Root-relative URL: prepend base domain, strip fragment and trailing slash *) |
| 52 | + <Create> the <joined-url> with "${<base-domain>}${<raw-url>}". |
| 53 | + <Split> the <frag-parts> from the <joined-url> by /#/. |
| 54 | + <Extract> the <no-fragment: first> from the <frag-parts>. |
| 55 | + <Split> the <slash-parts> from the <no-fragment> by /\/+$/. |
| 56 | + <Extract> the <clean-url: first> from the <slash-parts>. |
| 57 | + <Emit> a <FilterUrl: event> with { url: <clean-url>, base: <base-domain> }. |
54 | 58 | } |
55 | 59 | case /^(#|mailto:|javascript:|tel:|data:)/ { |
56 | 60 | (* Skip fragments and special URLs *) |
|
67 | 71 | <Extract> the <base-domain> from the <event-data: base>. |
68 | 72 |
|
69 | 73 | (* Filter URLs that belong to the same domain as base-domain *) |
70 | | - <Log> "Queuing: ${<url>}" to the <console> when <url> contains <base-domain>. |
71 | 74 | <Emit> a <QueueUrl: event> with { url: <url>, base: <base-domain> } when <url> contains <base-domain>. |
72 | 75 |
|
73 | 76 | <Return> an <OK: status> for the <filter>. |
|
79 | 82 | <Extract> the <url> from the <event-data: url>. |
80 | 83 | <Extract> the <base-domain> from the <event-data: base>. |
81 | 84 |
|
82 | | - (* Check if already crawled *) |
83 | | - <Retrieve> the <crawled-urls> from the <crawled-repository>. |
84 | | - <Create> the <single-url-list> with [<url>]. |
85 | | - <Compute> the <uncrawled-urls: difference> from <single-url-list> with <crawled-urls>. |
86 | | - <Compute> the <uncrawled-count: count> from <uncrawled-urls>. |
| 85 | + (* Atomic store - the repository Actor serializes concurrent access, |
| 86 | + so only the first caller for a given URL gets <new-entry> = 1 *)
| 87 | + <Store> the <url> into the <crawled-repository>. |
87 | 88 |
|
88 | | - (* Only queue if not already crawled *) |
89 | | - <Log> "Queued: ${<url>}" to the <console> when <uncrawled-count> > 0. |
90 | | - <Emit> a <CrawlPage: event> with { url: <url>, base: <base-domain> } when <uncrawled-count> > 0. |
| 89 | + (* Only emit CrawlPage if this URL was newly stored *) |
| 90 | + <Log> "Queued: ${<url>}" to the <console> when <new-entry> > 0. |
| 91 | + <Emit> a <CrawlPage: event> with { url: <url>, base: <base-domain> } when <new-entry> > 0. |
91 | 92 |
|
92 | 93 | <Return> an <OK: status> for the <queue>. |
93 | 94 | } |
0 commit comments