From 5ff93437a4cf0301af5eb2299234655fa9602d53 Mon Sep 17 00:00:00 2001 From: Sean Story Date: Fri, 6 Feb 2026 15:48:20 -0500 Subject: [PATCH] Fix excluded tags lookup to use correct key type (#417) ### Closes https://github.com/elastic/crawler/issues/416 The `exclude_tags` configuration was not being applied correctly. The config stores exclude_tags keyed by domain URL strings (e.g., `"https://example.com"`), but the lookup in `get_body_tag` was using the URL object directly as the hash key instead of `url.site`. This fix changes the lookup to use `url.site` (which returns the scheme + host as a string) to match how the config stores the keys. ### Checklists #### Pre-Review Checklist - [x] This PR does NOT contain credentials of any kind, such as API keys or username/passwords (double check `crawler.yml.example` and `elasticsearch.yml.example`) - [x] This PR has a meaningful title - [x] This PR links to all relevant GitHub issues that it fixes or partially addresses - Fixes #416 - [x] This PR has a thorough description - [x] Covered the changes with automated tests - [ ] Tested the changes locally - [x] Added a label for each target release version (example: `v0.1.0`) - [x] Considered corresponding documentation changes - N/A - this is a bug fix, no documentation changes needed - [x] Contributed any configuration settings changes to the configuration reference - N/A - no configuration changes - [x] Ran `make notice` if any dependencies have been added - N/A - no dependencies added #### Changes Requiring Extra Attention N/A - This is a straightforward bug fix with no security implications or new dependencies. ### Release Note Fixed `exclude_tags` domain configuration not being applied during crawl. Tags specified in `exclude_tags` for a domain are now correctly excluded from the document body. 
--- lib/crawler/data/crawl_result/html.rb | 5 ++- .../crawler/data/crawl_result/html_spec.rb | 31 ++++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/lib/crawler/data/crawl_result/html.rb b/lib/crawler/data/crawl_result/html.rb index a441e497..0fe062c8 100644 --- a/lib/crawler/data/crawl_result/html.rb +++ b/lib/crawler/data/crawl_result/html.rb @@ -219,7 +219,10 @@ def headings(limit: 10) def get_body_tag(exclude_tags) exclude_tags ||= {} - tags_to_exclude_for_domain = exclude_tags.fetch(url, []) + # Config stores exclude_tags keyed by domain URL (e.g., "https://example.com") + # Try site first (scheme + host), which is the standard format + tags_to_exclude_for_domain = exclude_tags.fetch(url.site, nil) || + exclude_tags.fetch(url.to_s, []) if tags_to_exclude_for_domain.empty? parsed_content.body diff --git a/spec/lib/crawler/data/crawl_result/html_spec.rb b/spec/lib/crawler/data/crawl_result/html_spec.rb index 693c143f..12b5d943 100644 --- a/spec/lib/crawler/data/crawl_result/html_spec.rb +++ b/spec/lib/crawler/data/crawl_result/html_spec.rb @@ -329,13 +329,42 @@ end context 'when given a list of tags to exclude' do - let(:body_text) { crawl_result.document_body(exclude_tags: { url => ['h1'] }) } + # Keys must be site strings (scheme + host) to match how config stores them + let(:body_text) { crawl_result.document_body(exclude_tags: { url.site => ['h1'] }) } it 'should remove content associated with those tags, even if there is a data-elastic-include' do expect(body_text).to_not match('Page header') end end + context 'when given a list of tags to exclude with string keys (as from config)' do + let(:html) do + <<~HTML + + +
HEADER TEXT Should not be indexed
+

title

+

BODY content

+
main street 123 to be ignored too
+ + + + HTML + end + + # This reproduces the bug from https://github.com/elastic/crawler/issues/416 + # The config stores exclude_tags with string keys (site URLs like "https://example.com"), + # so the lookup must use url.site to match the config format + let(:body_text) { crawl_result.document_body(exclude_tags: { url.site => %w[header address] }) } + + it 'should remove content associated with those tags when keys are strings' do + expect(body_text).to_not match('HEADER TEXT') + expect(body_text).to_not match('main street') + expect(body_text).to match('BODY content') + expect(body_text).to match('FOOTER TEXT') + end + end + it 'should remove empty spaces from the content' do expect(body_text).to match('Something something else') end