|
16 | 16 | (* Use ParseHtml action to extract all href attributes from anchor tags *) |
17 | 17 | <ParseHtml> the <links: links> from the <html>. |
18 | 18 | <Compute> the <link-count: count> from the <links>. |
19 | | - <Log> "Found" to the <console>. |
20 | | - <Log> <link-count> to the <console>. |
21 | | - <Log> "links" to the <console>. |
| 19 | + <Log> "Found ${<link-count>} links" to the <console>. |
22 | 20 |
|
23 | 21 | (* Process each extracted link using parallel for *) |
24 | 22 | parallel for each <raw-url> in <links> { |
|
54 | 52 | <Create> the <absolute-url> with "${<base-domain>}${<raw-url>}". |
55 | 53 | <Emit> a <FilterUrl: event> with { url: <absolute-url>, base: <base-domain> }. |
56 | 54 | } |
57 | | - case /^#/ { |
58 | | - (* Fragment only - skip *) |
59 | | - } |
60 | | - case /^mailto:/ { |
61 | | - (* Email link - skip *) |
62 | | - } |
63 | | - case /^javascript:/ { |
64 | | - (* JavaScript link - skip *) |
65 | | - } |
66 | | - case /^tel:/ { |
67 | | - (* Phone link - skip *) |
68 | | - } |
69 | | - case /^data:/ { |
70 | | - (* Data URL - skip *) |
| 55 | + case /^(#|mailto:|javascript:|tel:|data:)/ { |
| 56 | + (* Skip fragments and special URLs *) |
71 | 57 | } |
72 | 58 | } |
73 | 59 |
|
|
80 | 66 | <Extract> the <url> from the <event-data: url>. |
81 | 67 | <Extract> the <base-domain> from the <event-data: base>. |
82 | 68 |
|
83 | | - (* Simple check: URL must start with http:// or https:// followed by same domain *) |
84 | | - (* For now, just pass all URLs through since they've been normalized *) |
85 | | - (* The QueueUrl handler will filter duplicates *) |
86 | | - match <url> { |
87 | | - case /^https?:\/\/ausdertechnik\.de/ { |
88 | | - <Log> "Queuing:" to the <console>. |
89 | | - <Log> <url> to the <console>. |
90 | | - <Emit> a <QueueUrl: event> with { url: <url>, base: <base-domain> }. |
91 | | - } |
92 | | - } |
| 69 | + (* Queue only URLs that contain base-domain; this is a containment check, not an anchored host match *) |
| 70 | + <Log> "Queuing: ${<url>}" to the <console> when <url> contains <base-domain>. |
| 71 | + <Emit> a <QueueUrl: event> with { url: <url>, base: <base-domain> } when <url> contains <base-domain>. |
93 | 72 |
|
94 | 73 | <Return> an <OK: status> for the <filter>. |
95 | 74 | } |
|
107 | 86 | <Compute> the <uncrawled-count: count> from <uncrawled-urls>. |
108 | 87 |
|
109 | 88 | (* Only queue if not already crawled *) |
110 | | - <Log> "Queued:" to the <console> when <uncrawled-count> > 0. |
111 | | - <Log> <url> to the <console> when <uncrawled-count> > 0. |
| 89 | + <Log> "Queued: ${<url>}" to the <console> when <uncrawled-count> > 0. |
112 | 90 | <Emit> a <CrawlPage: event> with { url: <url>, base: <base-domain> } when <uncrawled-count> > 0. |
113 | 91 |
|
114 | 92 | <Return> an <OK: status> for the <queue>. |
|
0 commit comments