From 5ff93437a4cf0301af5eb2299234655fa9602d53 Mon Sep 17 00:00:00 2001 From: Sean Story Date: Fri, 6 Feb 2026 15:48:20 -0500 Subject: [PATCH] Fix excluded tags lookup to use correct key type (#417) ### Closes https://github.com/elastic/crawler/issues/416 The `exclude_tags` configuration was not being applied correctly. The config stores exclude_tags keyed by domain URL strings (e.g., `"https://example.com"`), but the lookup in `get_body_tag` was using the URL object directly as the hash key instead of `url.site`. This fix changes the lookup to use `url.site` (which returns the scheme + host as a string) to match how the config stores the keys. ### Checklists #### Pre-Review Checklist - [x] This PR does NOT contain credentials of any kind, such as API keys or username/passwords (double check `crawler.yml.example` and `elasticsearch.yml.example`) - [x] This PR has a meaningful title - [x] This PR links to all relevant GitHub issues that it fixes or partially addresses - Fixes #416 - [x] This PR has a thorough description - [x] Covered the changes with automated tests - [ ] Tested the changes locally - [x] Added a label for each target release version (example: `v0.1.0`) - [x] Considered corresponding documentation changes - N/A - this is a bug fix, no documentation changes needed - [x] Contributed any configuration settings changes to the configuration reference - N/A - no configuration changes - [x] Ran `make notice` if any dependencies have been added - N/A - no dependencies added #### Changes Requiring Extra Attention N/A - This is a straightforward bug fix with no security implications or new dependencies. ### Release Note Fixed `exclude_tags` domain configuration not being applied during crawl. Tags specified in `exclude_tags` for a domain are now correctly excluded from the document body. 
--- lib/crawler/data/crawl_result/html.rb | 5 ++- .../crawler/data/crawl_result/html_spec.rb | 31 ++++++++++++++++++- 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/lib/crawler/data/crawl_result/html.rb b/lib/crawler/data/crawl_result/html.rb index a441e497..0fe062c8 100644 --- a/lib/crawler/data/crawl_result/html.rb +++ b/lib/crawler/data/crawl_result/html.rb @@ -219,7 +219,10 @@ def headings(limit: 10) def get_body_tag(exclude_tags) exclude_tags ||= {} - tags_to_exclude_for_domain = exclude_tags.fetch(url, []) + # Config stores exclude_tags keyed by domain URL (e.g., "https://example.com") + # Try site first (scheme + host), which is the standard format + tags_to_exclude_for_domain = exclude_tags.fetch(url.site, nil) || + exclude_tags.fetch(url.to_s, []) if tags_to_exclude_for_domain.empty? parsed_content.body diff --git a/spec/lib/crawler/data/crawl_result/html_spec.rb b/spec/lib/crawler/data/crawl_result/html_spec.rb index 693c143f..12b5d943 100644 --- a/spec/lib/crawler/data/crawl_result/html_spec.rb +++ b/spec/lib/crawler/data/crawl_result/html_spec.rb @@ -329,13 +329,42 @@ end context 'when given a list of tags to exclude' do - let(:body_text) { crawl_result.document_body(exclude_tags: { url => ['h1'] }) } + # Keys must be site strings (scheme + host) to match how config stores them + let(:body_text) { crawl_result.document_body(exclude_tags: { url.site => ['h1'] }) } it 'should remove content associated with those tags, even if there is a data-elastic-include' do expect(body_text).to_not match('Page header') end end + context 'when given a list of tags to exclude with string keys (as from config)' do + let(:html) do + <<~HTML + + +
HEADER TEXT Should not be indexed
+

title

+

BODY content

+
main street 123 to be ignored too
+ + + + HTML + end + + # This reproduces the bug from https://github.com/elastic/crawler/issues/416 + # The config stores exclude_tags with string keys (site URLs like "https://example.com"), + # so the lookup must use url.site to match the config format + let(:body_text) { crawl_result.document_body(exclude_tags: { url.site => %w[header address] }) } + + it 'should remove content associated with those tags when keys are strings' do + expect(body_text).to_not match('HEADER TEXT') + expect(body_text).to_not match('main street') + expect(body_text).to match('BODY content') + expect(body_text).to match('FOOTER TEXT') + end + end + it 'should remove empty spaces from the content' do expect(body_text).to match('Something something else') end