From a37caa07fef4f69e53b3d522983d0b53e0e8bab3 Mon Sep 17 00:00:00 2001 From: koetsier Date: Tue, 24 Mar 2026 11:54:40 +0000 Subject: [PATCH 1/8] Bump the Elasticsearch client gem to version 7.10.x. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We’re planning an upcoming upgrade of our cluster to 7.10, so this change allows us to adopt the newer client early and ensure the codebase remains compatible. --- Gemfile | 2 +- Gemfile.lock | 14 +++++++------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/Gemfile b/Gemfile index 1afbacea1..6d9913212 100644 --- a/Gemfile +++ b/Gemfile @@ -3,7 +3,7 @@ source "https://rubygems.org" gem "activesupport" gem "aws-sdk-s3" gem "bootsnap", require: false -gem "elasticsearch", "~> 6" # We need a 6.x release to interface with Elasticsearch 6 +gem "elasticsearch", "~> 7.10.0", "< 7.11" # Pinned to the 7.10.x client ahead of the planned cluster upgrade to Elasticsearch 7.10 gem "gds-api-adapters" gem "google-analytics-data-v1beta" gem "google-api-client" diff --git a/Gemfile.lock b/Gemfile.lock index feea68cf1..cfe08f9fb 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -82,12 +82,12 @@ GEM docile (1.4.0) domain_name (0.6.20240107) drb (2.2.3) - elasticsearch (6.8.3) - elasticsearch-api (= 6.8.3) - elasticsearch-transport (= 6.8.3) - elasticsearch-api (6.8.3) + elasticsearch (7.10.1) + elasticsearch-api (= 7.10.1) + elasticsearch-transport (= 7.10.1) + elasticsearch-api (7.10.1) multi_json - elasticsearch-transport (6.8.3) + elasticsearch-transport (7.10.1) faraday (~> 1) multi_json erb (6.0.2) @@ -116,7 +116,7 @@ GEM faraday-net_http_persistent (1.2.0) faraday-patron (1.0.0) faraday-rack (1.0.0) - faraday-retry (1.0.3) + faraday-retry (1.0.4) ffi (1.15.5) find_a_port (1.0.1) gapic-common (1.2.0) @@ -761,7 +761,7 @@ DEPENDENCIES bootsnap bunny-mock climate_control - elasticsearch (~> 6) + elasticsearch (~> 7.10.0, < 7.11) gds-api-adapters google-analytics-data-v1beta google-api-client From 
a10247318b77d3051347f6c21b77cfb3006e64b8 Mon Sep 17 00:00:00 2001 From: koetsier Date: Tue, 24 Mar 2026 11:55:45 +0000 Subject: [PATCH 2/8] Remove the standard token filter. This filter is deprecated, as it no longer provides any functional behavior. https://stackoverflow.com/questions/76108163/getting-error-the-standard-token-filter-has-been-removed-when-running-a-que --- config/schema/elasticsearch_schema.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/config/schema/elasticsearch_schema.yml b/config/schema/elasticsearch_schema.yml index bd0e4b829..d7fb35b5f 100644 --- a/config/schema/elasticsearch_schema.yml +++ b/config/schema/elasticsearch_schema.yml @@ -14,19 +14,19 @@ index: default: type: custom tokenizer: standard - filter: [standard, asciifolding, lowercase, stop, stemmer_override, stemmer_english] + filter: [asciifolding, lowercase, stop, stemmer_override, stemmer_english] char_filter: [normalize_quotes, strip_quotes] # Analyzer used at query time for old-style shingle matching. 
shingled_query_analyzer: type: custom tokenizer: standard - filter: [standard, asciifolding, lowercase, stemmer_override, stemmer_english, bigrams] + filter: [asciifolding, lowercase, stemmer_override, stemmer_english, bigrams] with_shingles: type: custom tokenizer: standard - filter: [standard, asciifolding, lowercase, stemmer_override, stemmer_english, bigrams] + filter: [asciifolding, lowercase, stemmer_override, stemmer_english, bigrams] char_filter: [normalize_quotes, strip_quotes] # This analyzer does not filter out these stopwords: @@ -38,7 +38,7 @@ index: searchable_text: type: custom tokenizer: standard - filter: [standard, asciifolding, lowercase, stemmer_override, stemmer_english] + filter: [asciifolding, lowercase, stemmer_override, stemmer_english] char_filter: [normalize_quotes, strip_quotes] # Analyzer used at index time for the .synonym variants of searchable @@ -46,7 +46,7 @@ index: with_index_synonyms: type: custom tokenizer: standard - filter: [standard, asciifolding, lowercase, index_synonym, stop, stemmer_override, stemmer_english] + filter: [asciifolding, lowercase, index_synonym, stop, stemmer_override, stemmer_english] char_filter: [normalize_quotes, strip_quotes] # Analyzer used at search time for the .synonym variants of searchable @@ -54,7 +54,7 @@ index: with_search_synonyms: type: custom tokenizer: standard - filter: [standard, asciifolding, lowercase, search_synonym, stop, stemmer_override, stemmer_english] + filter: [asciifolding, lowercase, search_synonym, stop, stemmer_override, stemmer_english] char_filter: [normalize_quotes, strip_quotes] # An analyzer for doing "exact" word matching (but stripping wrapping whitespace, and case insensitive). 
@@ -68,14 +68,14 @@ index: best_bet_stemmed_match: type: custom tokenizer: standard - filter: [standard, asciifolding, lowercase, stemmer_override, stemmer_english] + filter: [asciifolding, lowercase, stemmer_override, stemmer_english] char_filter: [normalize_quotes, strip_quotes] # Analyzer used to process text supplied to the field for use in spelling correction. spelling_analyzer: type: custom tokenizer: standard - filter: [standard, asciifolding, lowercase, shingle] + filter: [asciifolding, lowercase, shingle] char_filter: [normalize_quotes, strip_quotes] # Analyzer used to process text fields for use for sorting. From e44e4262210313814b358e86ac1a6013f536a887 Mon Sep 17 00:00:00 2001 From: koetsier Date: Tue, 24 Mar 2026 14:29:43 +0000 Subject: [PATCH 3/8] Expect POST requests when performing searches with the Elasticsearch 7.10 client. The Elasticsearch client gem has been updated and now issues POST requests for search operations instead of GET. --- spec/unit/elasticsearch_index_spec.rb | 16 ++++++++-------- spec/unit/time_based_index_cleanup_spec.rb | 2 +- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/spec/unit/elasticsearch_index_spec.rb b/spec/unit/elasticsearch_index_spec.rb index 28079cb56..d53657e9b 100644 --- a/spec/unit/elasticsearch_index_spec.rb +++ b/spec/unit/elasticsearch_index_spec.rb @@ -84,7 +84,7 @@ end it "can be searched" do - stub_get = stub_request(:get, "http://example.com:9200/government_test/generic-document/_search").with( + stub_get = stub_request(:post, "http://example.com:9200/government_test/generic-document/_search").with( body: %r{"query":"keyword search"}, ).to_return( body: '{"hits":{"hits":[]}}', @@ -110,7 +110,7 @@ it "can fetch documents by format" do search_pattern = "http://example.com:9200/government_test/_search?scroll=1m&search_type=query_then_fetch&size=500&version=true" - stub_request(:get, search_pattern).with( + stub_request(:post, search_pattern).with( body: { query: { term: { format: 
"organisation" } }, _source: { includes: %w[title link] }, sort: %w[_doc] }, ).to_return( body: { _scroll_id: "abcdefgh", hits: { total: 10, hits: [] } }.to_json, @@ -135,7 +135,7 @@ it "can fetch documents by format with certain fields" do search_pattern = "http://example.com:9200/government_test/_search?scroll=1m&search_type=query_then_fetch&size=500&version=true" - stub_request(:get, search_pattern).with( + stub_request(:post, search_pattern).with( body: "{\"query\":{\"term\":{\"format\":\"organisation\"}},\"_source\":{\"includes\":[\"title\",\"link\"]},\"sort\":[\"_doc\"]}", ).to_return( body: { _scroll_id: "abcdefgh", hits: { total: 10, hits: [] } }.to_json, @@ -160,7 +160,7 @@ it "can count the documents without retrieving them all" do search_pattern = "http://example.com:9200/government_test/_search?scroll=1m&search_type=query_then_fetch&size=50&version=true" - stub_request(:get, search_pattern).with( + stub_request(:post, search_pattern).with( body: { query: expected_all_documents_query, sort: %w[_doc] }.to_json, ).to_return( body: { _scroll_id: "abcdefgh", hits: { total: 100 } }.to_json, @@ -172,7 +172,7 @@ it "can retrieve all documents" do search_uri = "http://example.com:9200/government_test/_search?scroll=1m&search_type=query_then_fetch&size=50&version=true" - stub_request(:get, search_uri).with( + stub_request(:post, search_uri).with( body: { query: expected_all_documents_query, sort: %w[_doc] }.to_json, ).to_return( body: { _scroll_id: "abcdefgh", hits: { total: 100, hits: [] } }.to_json, @@ -202,7 +202,7 @@ allow(described_class).to receive(:scroll_batch_size).and_return(2) - stub_request(:get, search_uri).with( + stub_request(:post, search_uri).with( body: { query: expected_all_documents_query, sort: %w[_doc] }.to_json, ).to_return( body: { _scroll_id: "abcdefgh", hits: { total: 3, hits: [] } }.to_json, @@ -277,7 +277,7 @@ def build_government_index def stub_popularity_index_requests(paths, popularity, total_pages = 10, total_requested = 
total_pages, paths_to_return = paths) # stub the request for total results - stub_request(:get, "http://example.com:9200/page-traffic_test/generic-document/_search") + stub_request(:post, "http://example.com:9200/page-traffic_test/generic-document/_search") .with(body: { "query" => { "match_all" => {} }, "size" => 0 }.to_json) .to_return( body: { "hits" => { "total" => total_pages } }.to_json, @@ -312,7 +312,7 @@ def stub_popularity_index_requests(paths, popularity, total_pages = 10, total_re }, } - stub_request(:get, "http://example.com:9200/page-traffic_test/generic-document/_search") + stub_request(:post, "http://example.com:9200/page-traffic_test/generic-document/_search") .with(body: expected_query.to_json) .to_return( body: response.to_json, diff --git a/spec/unit/time_based_index_cleanup_spec.rb b/spec/unit/time_based_index_cleanup_spec.rb index b8aab8275..fc6089ced 100644 --- a/spec/unit/time_based_index_cleanup_spec.rb +++ b/spec/unit/time_based_index_cleanup_spec.rb @@ -152,7 +152,7 @@ }, } - stub_request(:get, %r{#{base_uri}/test(.*?)/_search}) + stub_request(:post, %r{#{base_uri}/test(.*?)/_search}) .with( body: expected_timed_delete_body, ).to_return( From 1f9231097db6d383af6e224f2b413efaee01fd2c Mon Sep 17 00:00:00 2001 From: koetsier Date: Tue, 24 Mar 2026 14:33:10 +0000 Subject: [PATCH 4/8] Update expectations for mapping calls made to the Elasticsearch client. When updating a mapping, the Elasticsearch client may now use either of the following endpoint formats: $HOSTNAME/{index}/_mapping/{type} $HOSTNAME/{index}/{type}/_mappings Both URL patterns are supported by Elasticsearch 6.8. 
--- spec/unit/index_spec.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/spec/unit/index_spec.rb b/spec/unit/index_spec.rb index 9e1242129..60c91f7f4 100644 --- a/spec/unit/index_spec.rb +++ b/spec/unit/index_spec.rb @@ -16,7 +16,7 @@ }, }, } - stub = stub_request(:put, %r{#{base_uri}/govuk-abc/_mapping/generic-document}) + stub = stub_request(:put, %r{#{base_uri}/govuk-abc/generic-document/_mappings}) .with(body: mappings["generic-document"]) .to_return({ status: 200, @@ -28,7 +28,7 @@ "type" => "illegal_argument_exception", "reason" => "invalid mapping", } }.to_json - failing_stub = stub_request(:put, %r{#{base_uri}/govuk-abc/_mapping/failing-document}) + failing_stub = stub_request(:put, %r{#{base_uri}/govuk-abc/failing-document/_mappings}) .with(body: mappings["failing-document"]) .to_return({ status: 400, From 15dc31332b7e31276ce9482092f177a369583bfd Mon Sep 17 00:00:00 2001 From: koetsier Date: Tue, 24 Mar 2026 14:40:02 +0000 Subject: [PATCH 5/8] Update expectations for scroll search calls made to the Elasticsearch client. When performing a scroll search, the Elasticsearch client may now use either of the following endpoint formats: $HOSTNAME/_search/scroll/{scroll_id}?scroll=1m $HOSTNAME/_search/scroll?scroll=1m&scroll_id={scroll_id} Both URL patterns are supported by Elasticsearch 6.8. 
--- spec/unit/elasticsearch_index_spec.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/unit/elasticsearch_index_spec.rb b/spec/unit/elasticsearch_index_spec.rb index d53657e9b..f17335571 100644 --- a/spec/unit/elasticsearch_index_spec.rb +++ b/spec/unit/elasticsearch_index_spec.rb @@ -259,7 +259,7 @@ private def scroll_uri(scroll_id) - "http://example.com:9200/_search/scroll?scroll=1m&scroll_id=#{scroll_id}" + "http://example.com:9200/_search/scroll/#{scroll_id}?scroll=1m" end def scroll_response_body(scroll_id, total_results, results) From ef6f24c7adbc680fbd58378e3a99fd9b004339e7 Mon Sep 17 00:00:00 2001 From: koetsier Date: Tue, 24 Mar 2026 15:08:49 +0000 Subject: [PATCH 6/8] Replace _uid references with _id Although _uid is still present in Elasticsearch 6.8, it is deprecated. All usages have been updated to _id to ensure forward compatibility with future Elasticsearch versions. Reference: https://www.elastic.co/guide/en/elasticsearch/reference/6.8/mapping-uid-field.html --- lib/indexer/compare_enumerator.rb | 4 ++-- lib/tasks/export.rake | 2 +- spec/integration/scroll_enumerator_spec.rb | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/indexer/compare_enumerator.rb b/lib/indexer/compare_enumerator.rb index e4f4b2aa6..c5381348a 100644 --- a/lib/indexer/compare_enumerator.rb +++ b/lib/indexer/compare_enumerator.rb @@ -3,11 +3,11 @@ class CompareEnumerator < Enumerator NO_VALUE = :__no_value_found__ BATCH_SIZE = 250 DEFAULT_QUERY = { match_all: {} }.freeze - # sort by the document_type then the _uid, which is "type#id" - + # sort by the document_type then the _id (formerly _uid, which was "type#id") - # sorting on the id directly is not possible, and the type will # always be "generic-document", which is why we first need to sort # by document_type. 
- DEFAULT_SORT = %i[document_type _uid].freeze + DEFAULT_SORT = %i[document_type _id].freeze def initialize(left_index_name, right_index_name, cluster = Clusters.default_cluster, search_body = {}, options = {}) @cluster = cluster diff --git a/lib/tasks/export.rake b/lib/tasks/export.rake index e60b6f041..881d31c99 100644 --- a/lib/tasks/export.rake +++ b/lib/tasks/export.rake @@ -10,7 +10,7 @@ namespace :export do .transform_values { |v| [v] } search_params = SearchConfig.parse_parameters(params) query = search_params.search_config.generate_query_for_params(search_params) - query[:sort] = %i[document_type _uid] + query[:sort] = %i[document_type _id] fields = search_params.return_fields.uniq base_uri = search_params.search_config.base_uri diff --git a/spec/integration/scroll_enumerator_spec.rb b/spec/integration/scroll_enumerator_spec.rb index 50cca3174..350920a85 100644 --- a/spec/integration/scroll_enumerator_spec.rb +++ b/spec/integration/scroll_enumerator_spec.rb @@ -24,7 +24,7 @@ results = ScrollEnumerator.new( client:, index_names: "govuk_test", - search_body: { query: { match_all: {} }, sort: [{ _uid: { order: "asc" } }] }, + search_body: { query: { match_all: {} }, sort: [{ _id: { order: "asc" } }] }, batch_size: 4, ) { |d| d } From 07c76afd011938277004442828cd417335794257 Mon Sep 17 00:00:00 2001 From: koetsier Date: Tue, 24 Mar 2026 21:01:34 +0000 Subject: [PATCH 7/8] Adjust popularity scoring to handle missing values Use a fallback when doc['popularity'] is empty by applying POPULARITY_OFFSET directly; otherwise add the offset to the existing value. This is necessary because Elasticsearch 7+ raises an error when a script reads doc['popularity'].value for a document that has no popularity value. 
--- lib/search/query_components/popularity.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/search/query_components/popularity.rb b/lib/search/query_components/popularity.rb index 568b80bf2..457038831 100644 --- a/lib/search/query_components/popularity.rb +++ b/lib/search/query_components/popularity.rb @@ -19,7 +19,7 @@ def default_popularity_boost(boosted_query) script_score: { script: { lang: "painless", - source: "doc['popularity'].value + #{POPULARITY_OFFSET}", + source: "doc['popularity'].size() == 0 ? #{POPULARITY_OFFSET} : doc['popularity'].value + #{POPULARITY_OFFSET}", }, }, }, From 8c926f4a488818cc545981d1d418e41f9dbb76dd Mon Sep 17 00:00:00 2001 From: koetsier Date: Tue, 24 Mar 2026 21:16:35 +0000 Subject: [PATCH 8/8] Flatten 'should' query in booster Elasticsearch 7+ does not allow nested 'should' queries --- lib/search/query_components/booster.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/search/query_components/booster.rb b/lib/search/query_components/booster.rb index 3701e029f..983934db9 100644 --- a/lib/search/query_components/booster.rb +++ b/lib/search/query_components/booster.rb @@ -12,7 +12,7 @@ def wrap(core_query) score_mode: :multiply, query: { bool: { - should: [core_query], + should: [core_query].flatten, }, }, functions: boost_filters,