diff --git a/CHANGELOG.md b/CHANGELOG.md index 7af22823af..d1679263b4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,9 @@ app. - Added support for Daikanwajiten (Morohashi) references ([#2734](https://github.com/birchill/10ten-ja-reader/pull/2734)). - Worked around a [major string substitution bug in Safari](https://bugs.webkit.org/show_bug.cgi?id=306492). +- Fixed word lookups splitting ruby (``) text incorrectly, while still + allowing splits around center dots (`・`) + ([#2785](https://github.com/birchill/10ten-ja-reader/issues/2785)). ## [1.26.1] - 2025-12-23 diff --git a/src/background/background-message.ts b/src/background/background-message.ts index 0eff63c3cc..f8af239ed3 100644 --- a/src/background/background-message.ts +++ b/src/background/background-message.ts @@ -16,6 +16,8 @@ const SourceContextSchema = s.object({ inTranscription: s.optional(s.boolean()), }); +const IndivisibleRangeSchema = s.tuple([s.number(), s.number()]); + export const BackgroundMessageSchema = discriminator('type', { disable: s.type({ frame: s.literal('*') }), enable: s.type({ @@ -53,6 +55,7 @@ export const BackgroundMessageSchema = discriminator('type', { targetProps: s.type({}), text: s.string(), wordLookup: s.boolean(), + indivisibleRanges: s.optional(s.array(IndivisibleRangeSchema)), // Parameters for designating the iframe source source: s.type({ frameId: s.number(), diff --git a/src/background/background-request.ts b/src/background/background-request.ts index 72c2aeadee..958080f62a 100644 --- a/src/background/background-request.ts +++ b/src/background/background-request.ts @@ -3,7 +3,12 @@ import * as s from 'superstruct'; import { PopupStateSchema } from '../content/popup-state'; -const SearchRequestSchema = s.type({ input: s.string() }); +const IndivisibleRangeSchema = s.tuple([s.number(), s.number()]); + +const SearchRequestSchema = s.type({ + input: s.string(), + indivisibleRanges: s.optional(s.array(IndivisibleRangeSchema)), +}); export type SearchRequest = s.Infer; diff --git a/src/background/background.ts b/src/background/background.ts index d3e343ba00..9c707ff3f8 100644 --- a/src/background/background.ts +++ b/src/background/background.ts @@ -396,13 +396,18 @@ function notifyDbListeners(specifiedListener?: Runtime.Port) { async function searchWords({ input, + indivisibleRanges, abortSignal, }: SearchRequest & { abortSignal: AbortSignal; }): Promise { await dbReady; - const [words, dbStatus] = await jpdictSearchWords({ abortSignal, input }); + const [words, dbStatus] = await jpdictSearchWords({ + abortSignal, + input, + indivisibleRanges, + }); return { words, dbStatus }; } diff --git a/src/background/jpdict.ts b/src/background/jpdict.ts index a8eed1a0ee..d29672414e 100644 --- a/src/background/jpdict.ts +++ b/src/background/jpdict.ts @@ -15,6 +15,7 @@ import { import { kanaToHiragana } from '@birchill/normal-jp'; import browser from 'webextension-polyfill'; +import type { IndivisibleRanges } from '../common/indivisible-range'; import { MAX_LOOKUP_LENGTH, MAX_TRANSLATE_INPUT_LENGTH, @@ -306,10 +307,12 @@ const WORDS_MAX_ENTRIES = 7; export async function searchWords({ input, + indivisibleRanges, abortSignal, max = 0, }: { input: string; + indivisibleRanges?: IndivisibleRanges; abortSignal?: AbortSignal; max?: number; }): Promise< @@ -348,6 +351,7 @@ export async function searchWords({ getWords, input: word, inputLengths, + indivisibleRanges, maxResults, }), dbStatus !== 'ok' ? dbStatus : undefined, diff --git a/src/background/word-search.test.ts b/src/background/word-search.test.ts new file mode 100644 index 0000000000..5261e7e62e --- /dev/null +++ b/src/background/word-search.test.ts @@ -0,0 +1,130 @@ +import { describe, expect, it } from 'vitest'; + +import { normalizeInput } from '../utils/normalize'; + +import type { DictionaryWordResult } from './search-result'; +import { wordSearch } from './word-search'; + +describe('wordSearch', () => { + it('does not split inside indivisible ranges while shortening', async () => { + const input = 'けんせいしてたすけに'; + const [normalized, inputLengths] = normalizeInput(input); + const lookups: Array = []; + + const result = await wordSearch({ + getWords: async ({ input }) => { + lookups.push(input); + return input === 'けんせいして' + ? [makeWordResult({ id: 1, reading: input })] + : []; + }, + input: normalized, + inputLengths, + indivisibleRanges: [[6, 8]], + maxResults: 10, + }); + + expect(result?.matchLen).toBe(6); + expect(lookups).toContain('けんせいして'); + expect(lookups).not.toContain('けんせいしてた'); + }); + + it('allows shortening at center-dot boundaries between indivisible ranges', async () => { + const input = 'あ・い・う'; + const [normalized, inputLengths] = normalizeInput(input); + const lookups: Array = []; + + const result = await wordSearch({ + getWords: async ({ input }) => { + lookups.push(input); + return input === 'あ・い' + ? [makeWordResult({ id: 2, reading: input })] + : []; + }, + input: normalized, + inputLengths, + indivisibleRanges: [ + [0, 1], + [2, 3], + [4, 5], + ], + maxResults: 10, + }); + + expect(result?.matchLen).toBe(3); + expect(lookups).toContain('あ・い'); + }); + + it('does not split a trailing yoon when shortening', async () => { + const [normalized, inputLengths] = normalizeInput('きゃ'); + const lookups: Array = []; + + await wordSearch({ + getWords: async ({ input }) => { + lookups.push(input); + return []; + }, + input: normalized, + inputLengths, + maxResults: 10, + }); + + expect(lookups).toContain('きゃ'); + expect(lookups).not.toContain('き'); + }); + + it('tries choon-expanded variants', async () => { + const [normalized, inputLengths] = normalizeInput('そーゆー'); + const lookups: Array = []; + + const result = await wordSearch({ + getWords: async ({ input }) => { + lookups.push(input); + return input === 'そうゆう' + ? [makeWordResult({ id: 3, reading: input })] + : []; + }, + input: normalized, + inputLengths, + maxResults: 10, + }); + + expect(result?.matchLen).toBe(4); + expect(lookups).toContain('そうゆう'); + }); + + it('tries 旧字体 to 新字体 variants', async () => { + const [normalized, inputLengths] = normalizeInput('國語'); + const lookups: Array = []; + + const result = await wordSearch({ + getWords: async ({ input }) => { + lookups.push(input); + return input === '国語' + ? [makeWordResult({ id: 4, reading: input })] + : []; + }, + input: normalized, + inputLengths, + maxResults: 10, + }); + + expect(result?.matchLen).toBe(2); + expect(lookups).toContain('国語'); + }); +}); + +function makeWordResult({ + id, + reading, +}: { + id: number; + reading: string; +}): DictionaryWordResult { + return { + id, + k: [], + r: [{ ent: reading, app: 0, matchRange: [0, reading.length] }], + s: [{ g: ['test'], match: true }], + } as unknown as DictionaryWordResult; +} diff --git a/src/background/word-search.ts b/src/background/word-search.ts index d88b3c192d..5f756c9bfd 100644 --- a/src/background/word-search.ts +++ b/src/background/word-search.ts @@ -2,6 +2,7 @@ import type { PartOfSpeech } from '@birchill/jpdict-idb'; import { AbortError } from '@birchill/jpdict-idb'; import { expandChoon, kyuujitaiToShinjitai } from '@birchill/normal-jp'; +import type { IndivisibleRanges } from '../common/indivisible-range'; import { isOnlyDigits } from '../utils/char-range'; import { toRomaji } from '../utils/romaji'; @@ -26,12 +27,14 @@ export async function wordSearch({ getWords, input, inputLengths, + indivisibleRanges, maxResults, }: { abortSignal?: AbortSignal; getWords: GetWordsFunction; input: string; inputLengths: Array; + indivisibleRanges?: IndivisibleRanges; maxResults: number; }): Promise { let longestMatch = 0; @@ -119,9 +122,22 @@ export async function wordSearch({ break; } - // Shorten input, but don't split a ようおん (e.g. きゃ). - const lengthToShorten = endsInYoon(input) ? 2 : 1; - input = input.substring(0, input.length - lengthToShorten); + // Shorten input, but don't split a ようおん (e.g. きゃ), and don't split + // any caller-provided indivisible segments (e.g. ruby text). + let nextInputLength = input.length - (endsInYoon(input) ? 2 : 1); + while (nextInputLength > 0) { + const nextMatchLength = inputLengths[nextInputLength]; + if ( + typeof nextMatchLength !== 'number' || + !isInIndivisibleRange(nextMatchLength, indivisibleRanges) + ) { + break; + } + nextInputLength -= endsInYoon(input.substring(0, nextInputLength)) + ? 2 + : 1; + } + input = input.substring(0, Math.max(nextInputLength, 0)); } if (!result.data.length) { @@ -132,6 +148,15 @@ export async function wordSearch({ return result; } +function isInIndivisibleRange( + offset: number, + indivisibleRanges: IndivisibleRanges | undefined +): boolean { + return !!indivisibleRanges?.some( + ([start, end]) => offset > start && offset < end + ); +} + async function lookupCandidates({ abortSignal, existingEntries, diff --git a/src/common/indivisible-range.ts b/src/common/indivisible-range.ts new file mode 100644 index 0000000000..b0d35ac4fa --- /dev/null +++ b/src/common/indivisible-range.ts @@ -0,0 +1,3 @@ +export type IndivisibleRange = [start: number, end: number]; + +export type IndivisibleRanges = Array; diff --git a/src/content/content.ts b/src/content/content.ts index 1a54023bed..7f438fd1f0 100644 --- a/src/content/content.ts +++ b/src/content/content.ts @@ -57,6 +57,7 @@ import type { } from '../common/content-config-params'; import type { CopyType } from '../common/copy-keys'; import { CopyKeys } from '../common/copy-keys'; +import type { IndivisibleRanges } from '../common/indivisible-range'; import { MAX_LOOKUP_LENGTH } from '../common/limits'; import { isEditableNode, isInteractiveElement } from '../utils/dom-utils'; import type { MarginBox, Point, Rect } from '../utils/geometry'; @@ -236,6 +237,7 @@ export class ContentHandler { | { text: string; wordLookup: boolean; + indivisibleRanges?: IndivisibleRanges; meta?: SelectionMeta; source: IframeSourceParams | null; sourceContext: SourceContext | null; @@ -1912,6 +1914,7 @@ export class ContentHandler { const lookupParams = { dictMode, + indivisibleRanges: textAtPoint.indivisibleRanges, meta: textAtPoint.meta, source: null, sourceContext: textAtPoint.sourceContext, @@ -1951,6 +1954,7 @@ export class ContentHandler { async lookupText({ dictMode, + indivisibleRanges, meta, source, sourceContext, @@ -1959,6 +1963,7 @@ export class ContentHandler { wordLookup, }: { dictMode: 'default' | 'kanji'; + indivisibleRanges?: IndivisibleRanges; meta?: SelectionMeta; source: IframeSourceParams | null; sourceContext: SourceContext | null; @@ -1970,6 +1975,7 @@ export class ContentHandler { text, meta, wordLookup, + indivisibleRanges, source, sourceContext, }; @@ -1981,11 +1987,13 @@ export class ContentHandler { this.isPopupExpanded = false; const queryResult = await query(text, { + indivisibleRanges, metaMatchLen: meta?.matchLen, wordLookup, updateQueryResult: (queryResult: QueryResult | null) => { void this.applyQueryResult({ dictMode, + indivisibleRanges, meta, queryResult, targetProps, @@ -1997,6 +2005,7 @@ export class ContentHandler { void this.applyQueryResult({ dictMode, + indivisibleRanges, meta, queryResult, targetProps, @@ -2007,6 +2016,7 @@ export class ContentHandler { async applyQueryResult({ dictMode, + indivisibleRanges, meta, queryResult, targetProps, @@ -2014,13 +2024,14 @@ export class ContentHandler { wordLookup, }: { dictMode: 'default' | 'kanji'; + indivisibleRanges?: IndivisibleRanges; meta?: SelectionMeta; queryResult: QueryResult | null; targetProps: TargetProps; text: string; wordLookup: boolean; }) { - const lookupParams = { text, meta, wordLookup }; + const lookupParams = { text, meta, wordLookup, indivisibleRanges }; // Check if we have triggered a new query or been disabled while running // the previous query. diff --git a/src/content/get-text.browser.test.ts b/src/content/get-text.browser.test.ts index 4001b24ed8..b3800d2c6d 100644 --- a/src/content/get-text.browser.test.ts +++ b/src/content/get-text.browser.test.ts @@ -814,6 +814,47 @@ describe('getTextAtPoint', () => { ); }); + it('marks rt text as indivisible when scanning across ruby and non-ruby text', () => { + // Arrange + testDiv.innerHTML = + '牽制けんせいしてたすけに'; + const firstRtNode = testDiv.firstChild!.childNodes[1].firstChild as Text; + const bbox = getBboxForOffset(firstRtNode, 0); + + // Act + const result = getTextAtPoint({ + point: { x: bbox.left + bbox.width / 2, y: bbox.top + bbox.height / 4 }, + }); + + // Assert + expect(result?.text).toBe('けんせいしてたすけに'); + expect(result?.indivisibleRanges).toEqual([ + [0, 4], + [6, 8], + ]); + }); + + it('splits indivisible ranges at center dots in rt text', () => { + // Arrange + testDiv.innerHTML = + '最高経営責任者シー・イー・オーです'; + const rtNode = testDiv.firstChild!.childNodes[1].firstChild as Text; + const bbox = getBboxForOffset(rtNode, 0); + + // Act + const result = getTextAtPoint({ + point: { x: bbox.left + bbox.width / 2, y: bbox.top + bbox.height / 4 }, + }); + + // Assert + expect(result?.text).toBe('シー・イー・オーです'); + expect(result?.indivisibleRanges).toEqual([ + [0, 2], + [3, 5], + [6, 8], + ]); + }); + it('traverses okurigana in inline-block elements too', () => { // Arrange diff --git a/src/content/query.ts b/src/content/query.ts index b7d9f2344a..c652ad7369 100644 --- a/src/content/query.ts +++ b/src/content/query.ts @@ -10,6 +10,7 @@ import type { TranslateResult, WordSearchResult, } from '../background/search-result'; +import type { IndivisibleRanges } from '../common/indivisible-range'; import { hasKatakana } from '../utils/char-range'; import { omit } from '../utils/omit'; @@ -27,6 +28,7 @@ export type QueryResult = { export type NamePreview = { names: Array; more: boolean }; export interface QueryOptions { + indivisibleRanges?: IndivisibleRanges; metaMatchLen?: number; wordLookup: boolean; updateQueryResult: (result: QueryResult | null) => void; @@ -124,10 +126,16 @@ async function queryWords( text: string, options: QueryOptions ): Promise { - const message: BackgroundRequest = { - type: options.wordLookup ? 'searchWords' : 'translate', - input: text, - }; + let message: BackgroundRequest; + if (options.wordLookup) { + message = { + type: 'searchWords', + input: text, + indivisibleRanges: options.indivisibleRanges, + }; + } else { + message = { type: 'translate', input: text }; + } let searchResult: SearchWordsResult | TranslateResult | 'aborted' | null; try { @@ -299,11 +307,17 @@ function addNamePreview(result: QueryResult): QueryResult { } function getCacheKey({ + indivisibleRanges, text, wordLookup, }: { + indivisibleRanges?: IndivisibleRanges; text: string; wordLookup: boolean; }): string { - return [text, wordLookup ? '1' : '0'].join('-'); + return [ + text, + wordLookup ? '1' : '0', + JSON.stringify(indivisibleRanges || []), + ].join('-'); } diff --git a/src/content/scan-text.ts b/src/content/scan-text.ts index 63e5cd841f..b8c6d623a8 100644 --- a/src/content/scan-text.ts +++ b/src/content/scan-text.ts @@ -1,3 +1,4 @@ +import type { IndivisibleRanges } from '../common/indivisible-range'; import { nonJapaneseChar } from '../utils/char-range'; import { normalizeContext } from '../utils/normalize'; @@ -35,6 +36,9 @@ export type ScanTextResult = { // Extra metadata we parsed in the process meta?: SelectionMeta; + + // Ranges in `text` that should not be split during lookup. + indivisibleRanges?: IndivisibleRanges; }; export function scanText({ @@ -204,6 +208,7 @@ export function scanText({ } const result: ScanTextResult = { text: '', textRange: [], sourceContext }; + const indivisibleRanges: IndivisibleRanges = []; let textDelimiter = nonJapaneseChar; @@ -251,12 +256,25 @@ export function scanText({ } else if (textEnd !== -1) { // The text node has disallowed characters mid-way through so // return up to that point. - result.text += nodeText.substring(0, textEnd); + const textToAppend = nodeText.substring(0, textEnd); + addIndivisibleRanges({ + indivisibleRanges, + node, + text: textToAppend, + outputOffset: result.text.length, + }); + result.text += textToAppend; result.textRange.push({ node, start: offset, end: offset + textEnd }); break; } // The whole text node is allowed characters, keep going. + addIndivisibleRanges({ + indivisibleRanges, + node, + text: nodeText, + outputOffset: result.text.length, + }); result.text += nodeText; result.textRange.push({ node, start: offset, end: node.data.length }); @@ -292,10 +310,61 @@ export function scanText({ trimSourceContext(sourceContext, result.text.length); result.meta = extractGetTextMetadata({ text: result.text, matchCurrency }); + if (indivisibleRanges.length) { + result.indivisibleRanges = indivisibleRanges; + } return result; } +function addIndivisibleRanges({ + indivisibleRanges, + node, + text, + outputOffset, +}: { + indivisibleRanges: IndivisibleRanges; + node: Text; + text: string; + outputOffset: number; +}) { + if (!text.length || !node.parentElement?.closest('rt')) { + return; + } + + // Treat all content in as indivisible except around center dots. + // + // That is, "あ・い" becomes two indivisible segments, "あ" and "い". + let segmentStart = 0; + while (segmentStart < text.length) { + const dotOffset = text.indexOf('・', segmentStart); + const segmentEnd = dotOffset === -1 ? text.length : dotOffset; + if (segmentEnd > segmentStart) { + pushIndivisibleRange(indivisibleRanges, [ + outputOffset + segmentStart, + outputOffset + segmentEnd, + ]); + } + if (dotOffset === -1) { + break; + } + segmentStart = dotOffset + 1; + } +} + +function pushIndivisibleRange( + indivisibleRanges: IndivisibleRanges, + range: [start: number, end: number] +) { + const lastRange = indivisibleRanges.at(-1); + if (!lastRange || lastRange[1] < range[0]) { + indivisibleRanges.push(range); + return; + } + + lastRange[1] = Math.max(lastRange[1], range[1]); +} + // ---------------------------------------------------------------------------- // // DOM helpers