Skip to content

Commit 056c969

Browse files
committed
Optimize code
1 parent 3c3e4e7 commit 056c969

4 files changed

Lines changed: 12 additions & 41 deletions

File tree

crawler.aro

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,7 @@
1313
<Extract> the <url> from the <event-data: url>.
1414
<Extract> the <base-domain> from the <event-data: base>.
1515

16-
<Log> "Extracted URL:" to the <console>.
17-
<Log> <url> to the <console>.
16+
<Log> "Extracted URL: ${<url>}" to the <console>.
1817

1918
(* Check if already crawled *)
2019
<Retrieve> the <crawled-urls> from the <crawled-repository>.
@@ -23,25 +22,22 @@
2322
<Compute> the <new-url-count: count> from <new-urls>.
2423

2524
(* Skip if already crawled - use match to check count *)
26-
<Log> "Checking URL..." to the <console>.
2725
match <new-url-count> {
2826
case 0 {
2927
<Return> an <OK: status> for the <skip>.
3028
}
3129
}
3230

33-
<Log> "Crawling:" to the <console>.
34-
<Log> <url> to the <console>.
31+
<Log> "Crawling: ${<url>}" to the <console>.
3532

3633
(* Mark URL as crawled before fetching to prevent duplicate requests *)
3734
<Compute> the <updated-crawled: union> from <crawled-urls> with <single-url-list>.
3835
<Store> the <updated-crawled> into the <crawled-repository>.
3936

4037
(* Fetch the page *)
4138
<Request> the <html> from the <url>.
42-
<Log> "Fetched HTML length:" to the <console>.
4339
<Compute> the <html-len: length> from the <html>.
44-
<Log> <html-len> to the <console>.
40+
<Log> "Fetched HTML length: ${<html-len>}" to the <console>.
4541

4642
(* Extract markdown content from HTML using ParseHtml action *)
4743
<ParseHtml> the <markdown-result: markdown> from the <html>.

links.aro

Lines changed: 7 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,7 @@
1616
(* Use ParseHtml action to extract all href attributes from anchor tags *)
1717
<ParseHtml> the <links: links> from the <html>.
1818
<Compute> the <link-count: count> from the <links>.
19-
<Log> "Found" to the <console>.
20-
<Log> <link-count> to the <console>.
21-
<Log> "links" to the <console>.
19+
<Log> "Found ${<link-count>} links" to the <console>.
2220

2321
(* Process each extracted link using parallel for *)
2422
parallel for each <raw-url> in <links> {
@@ -54,20 +52,8 @@
5452
<Create> the <absolute-url> with "${<base-domain>}${<raw-url>}".
5553
<Emit> a <FilterUrl: event> with { url: <absolute-url>, base: <base-domain> }.
5654
}
57-
case /^#/ {
58-
(* Fragment only - skip *)
59-
}
60-
case /^mailto:/ {
61-
(* Email link - skip *)
62-
}
63-
case /^javascript:/ {
64-
(* JavaScript link - skip *)
65-
}
66-
case /^tel:/ {
67-
(* Phone link - skip *)
68-
}
69-
case /^data:/ {
70-
(* Data URL - skip *)
55+
case /^(#|mailto:|javascript:|tel:|data:)/ {
56+
(* Skip fragments and special URLs *)
7157
}
7258
}
7359

@@ -80,16 +66,9 @@
8066
<Extract> the <url> from the <event-data: url>.
8167
<Extract> the <base-domain> from the <event-data: base>.
8268

83-
(* Simple check: URL must start with http:// or https:// followed by same domain *)
84-
(* For now, just pass all URLs through since they've been normalized *)
85-
(* The QueueUrl handler will filter duplicates *)
86-
match <url> {
87-
case /^https?:\/\/ausdertechnik\.de/ {
88-
<Log> "Queuing:" to the <console>.
89-
<Log> <url> to the <console>.
90-
<Emit> a <QueueUrl: event> with { url: <url>, base: <base-domain> }.
91-
}
92-
}
69+
(* Filter URLs that belong to the same domain as base-domain *)
70+
<Log> "Queuing: ${<url>}" to the <console> when <url> contains <base-domain>.
71+
<Emit> a <QueueUrl: event> with { url: <url>, base: <base-domain> } when <url> contains <base-domain>.
9372

9473
<Return> an <OK: status> for the <filter>.
9574
}
@@ -107,8 +86,7 @@
10786
<Compute> the <uncrawled-count: count> from <uncrawled-urls>.
10887

10988
(* Only queue if not already crawled *)
110-
<Log> "Queued:" to the <console> when <uncrawled-count> > 0.
111-
<Log> <url> to the <console> when <uncrawled-count> > 0.
89+
<Log> "Queued: ${<url>}" to the <console> when <uncrawled-count> > 0.
11290
<Emit> a <CrawlPage: event> with { url: <url>, base: <base-domain> } when <uncrawled-count> > 0.
11391

11492
<Return> an <OK: status> for the <queue>.

main.aro

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,8 +10,7 @@
1010
(* Read starting URL from environment *)
1111
<Extract> the <start-url> from the <env: CRAWL_URL>.
1212

13-
<Log> "Starting URL:" to the <console>.
14-
<Log> <start-url> to the <console>.
13+
<Log> "Starting URL: ${<start-url>}" to the <console>.
1514

1615
(* Create output directory *)
1716
<Create> the <output-path> with "./output".

storage.aro

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,7 @@
1616
<Compute> the <url-hash: hash> from the <url>.
1717
<Create> the <file-path> with "./output/${<url-hash>}.md".
1818

19-
<Log> "Saving:" to the <console>.
20-
<Log> <url> to the <console>.
21-
<Log> <file-path> to the <console>.
19+
<Log> "Saving: ${<url>} to ${<file-path>}" to the <console>.
2220

2321
(* Format markdown file with frontmatter *)
2422
<Create> the <file-content> with "# ${<title>}\n\n**Source:** ${<url>}\n\n---\n\n${<content>}".

0 commit comments

Comments (0)