Skip to content

Commit e699b4f

Browse files
committed
Make Reline::Unicode's vi_ ed_ em_ method encoding safe
1 parent 7f28396 commit e699b4f

File tree

2 files changed: 87 additions and 50 deletions.

lib/reline/unicode.rb

Lines changed: 51 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -263,50 +263,49 @@ def self.get_prev_mbchar_size(line, byte_pointer)
263263

264264
def self.em_forward_word(line, byte_pointer)
265265
gcs = line.byteslice(byte_pointer..).grapheme_clusters
266-
nonwords = gcs.take_while { |c| c.encode(Encoding::UTF_8).match?(/\P{Word}/) }
267-
words = gcs.drop(nonwords.size).take_while { |c| c.encode(Encoding::UTF_8).match?(/\p{Word}/) }
266+
nonwords = gcs.take_while { |c| !word_character?(c) }
267+
words = gcs.drop(nonwords.size).take_while { |c| word_character?(c) }
268268
nonwords.sum(&:bytesize) + words.sum(&:bytesize)
269269
end
270270

271271
def self.em_forward_word_with_capitalization(line, byte_pointer)
272272
gcs = line.byteslice(byte_pointer..).grapheme_clusters
273-
nonwords = gcs.take_while { |c| c.encode(Encoding::UTF_8).match?(/\P{Word}/) }
274-
words = gcs.drop(nonwords.size).take_while { |c| c.encode(Encoding::UTF_8).match?(/\p{Word}/) }
273+
nonwords = gcs.take_while { |c| !word_character?(c) }
274+
words = gcs.drop(nonwords.size).take_while { |c| word_character?(c) }
275275
[nonwords.sum(&:bytesize) + words.sum(&:bytesize), nonwords.join + words.join.capitalize]
276276
end
277277

278278
def self.em_backward_word(line, byte_pointer)
279279
gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse
280-
nonwords = gcs.take_while { |c| c.encode(Encoding::UTF_8).match?(/\P{Word}/) }
281-
words = gcs.drop(nonwords.size).take_while { |c| c.encode(Encoding::UTF_8).match?(/\p{Word}/) }
280+
nonwords = gcs.take_while { |c| !word_character?(c) }
281+
words = gcs.drop(nonwords.size).take_while { |c| word_character?(c) }
282282
nonwords.sum(&:bytesize) + words.sum(&:bytesize)
283283
end
284284

285285
def self.em_big_backward_word(line, byte_pointer)
286286
gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse
287-
spaces = gcs.take_while { |c| c.match?(/\s/) }
288-
nonspaces = gcs.drop(spaces.size).take_while { |c| c.match?(/\S/) }
287+
spaces = gcs.take_while { |c| space_character?(c) }
288+
nonspaces = gcs.drop(spaces.size).take_while { |c| !space_character?(c) }
289289
spaces.sum(&:bytesize) + nonspaces.sum(&:bytesize)
290290
end
291291

292292
def self.ed_transpose_words(line, byte_pointer)
293293
gcs = line.byteslice(0, byte_pointer).grapheme_clusters
294294
pos = gcs.size
295295
gcs += line.byteslice(byte_pointer..).grapheme_clusters
296-
gcs.map! { |c| c.encode(Encoding::UTF_8) }
297-
pos += 1 while pos < gcs.size && gcs[pos].match?(/\P{Word}/)
296+
pos += 1 while pos < gcs.size && !word_character?(gcs[pos])
298297
if pos == gcs.size # 'aaa bbb [cursor] '
299-
pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\P{Word}/)
298+
pos -= 1 while pos > 0 && !word_character?(gcs[pos - 1])
300299
second_word_end = gcs.size
301300
else # 'aaa [cursor]bbb'
302-
pos += 1 while pos < gcs.size && gcs[pos].match?(/\p{Word}/)
301+
pos += 1 while pos < gcs.size && word_character?(gcs[pos])
303302
second_word_end = pos
304303
end
305-
pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\p{Word}/)
304+
pos -= 1 while pos > 0 && word_character?(gcs[pos - 1])
306305
second_word_start = pos
307-
pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\P{Word}/)
306+
pos -= 1 while pos > 0 && !word_character?(gcs[pos - 1])
308307
first_word_end = pos
309-
pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\p{Word}/)
308+
pos -= 1 while pos > 0 && word_character?(gcs[pos - 1])
310309
first_word_start = pos
311310

312311
[first_word_start, first_word_end, second_word_start, second_word_end].map do |idx|
@@ -316,72 +315,73 @@ def self.ed_transpose_words(line, byte_pointer)
316315

317316
def self.vi_big_forward_word(line, byte_pointer)
318317
gcs = line.byteslice(byte_pointer..).grapheme_clusters
319-
nonspaces = gcs.take_while { |c| c.match?(/\S/) }
320-
spaces = gcs.drop(nonspaces.size).take_while { |c| c.match?(/\s/) }
318+
nonspaces = gcs.take_while { |c| !space_character?(c) }
319+
spaces = gcs.drop(nonspaces.size).take_while { |c| space_character?(c) }
321320
nonspaces.sum(&:bytesize) + spaces.sum(&:bytesize)
322321
end
323322

324323
def self.vi_big_forward_end_word(line, byte_pointer)
325324
gcs = line.byteslice(byte_pointer..).grapheme_clusters
326325
first = gcs.shift(1)
327-
spaces = gcs.take_while { |c| c.match?(/\s/) }
328-
nonspaces = gcs.drop(spaces.size).take_while { |c| c.match?(/\S/) }
326+
spaces = gcs.take_while { |c| space_character?(c) }
327+
nonspaces = gcs.drop(spaces.size).take_while { |c| !space_character?(c) }
329328
matched = spaces + nonspaces
330329
matched.pop
331330
first.sum(&:bytesize) + matched.sum(&:bytesize)
332331
end
333332

334333
def self.vi_big_backward_word(line, byte_pointer)
335334
gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse
336-
spaces = gcs.take_while { |c| c.match?(/\s/) }
337-
nonspaces = gcs.drop(spaces.size).take_while { |c| c.match?(/\S/) }
335+
spaces = gcs.take_while { |c| space_character?(c) }
336+
nonspaces = gcs.drop(spaces.size).take_while { |c| !space_character?(c) }
338337
spaces.sum(&:bytesize) + nonspaces.sum(&:bytesize)
339338
end
340339

341340
def self.vi_forward_word(line, byte_pointer, drop_terminate_spaces = false)
342-
gcs = line.byteslice(byte_pointer..).grapheme_clusters.map { |c| c.encode(Encoding::UTF_8) }
341+
gcs = line.byteslice(byte_pointer..).grapheme_clusters
343342
return 0 if gcs.empty?
344343

345-
regexp =
346-
case gcs.first
347-
when /\p{Word}/
348-
/\p{Word}/
349-
when /\s/
350-
/\s/
344+
c = gcs.first
345+
matched =
346+
if word_character?(c)
347+
gcs.take_while { |c| word_character?(c) }
348+
elsif space_character?(c)
349+
gcs.take_while { |c| space_character?(c) }
351350
else
352-
/[^\p{Word}\s]/
351+
gcs.take_while { |c| !word_character?(c) && !space_character?(c) }
353352
end
354-
matched = gcs.take_while { |c| c.match?(regexp) }
353+
355354
return matched.sum(&:bytesize) if drop_terminate_spaces
356355

357-
spaces = gcs.drop(matched.size).take_while { |c| c.match?(/\s/) }
356+
spaces = gcs.drop(matched.size).take_while { |c| space_character?(c) }
358357
matched.sum(&:bytesize) + spaces.sum(&:bytesize)
359358
end
360359

361360
def self.vi_forward_end_word(line, byte_pointer)
362-
gcs = line.byteslice(byte_pointer..).grapheme_clusters.map { |c| c.encode(Encoding::UTF_8) }
361+
gcs = line.byteslice(byte_pointer..).grapheme_clusters
363362
return 0 if gcs.empty?
364363
return gcs.first.bytesize if gcs.size == 1
365364

366365
start = gcs.shift
367366
skips = [start]
368-
if start.match?(/\s/) || gcs.first.match?(/\s/)
369-
spaces = gcs.take_while { |c| c.match?(/\s/) }
367+
if space_character?(start) || space_character?(gcs.first)
368+
spaces = gcs.take_while { |c| space_character?(c) }
370369
skips += spaces
371370
gcs.shift(spaces.size)
372371
end
373-
regexp = /\p{Word}/.match?(gcs.first) ? /\p{Word}/ : /[^\p{Word}\s]/
374-
matched = gcs.take_while { |c| c.match?(regexp) }
372+
start_with_word = word_character?(gcs.first)
373+
matched = gcs.take_while { |c| start_with_word ? word_character?(c) : !word_character?(c) && !space_character?(c) }
375374
matched.pop
376375
skips.sum(&:bytesize) + matched.sum(&:bytesize)
377376
end
378377

379378
def self.vi_backward_word(line, byte_pointer)
380-
gcs = line.byteslice(0, byte_pointer).grapheme_clusters.map { |c| c.encode(Encoding::UTF_8) }.reverse
381-
spaces = gcs.take_while { |c| c.match?(/\s/) }
379+
gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse
380+
spaces = gcs.take_while { |c| space_character?(c) }
382381
gcs.shift(spaces.size)
383-
regexp = /\p{Word}/.match?(gcs.first) ? /\p{Word}/ : /[^\p{Word}\s]/
384-
spaces.sum(&:bytesize) + gcs.take_while { |c| c.match?(regexp) }.sum(&:bytesize)
382+
start_with_word = word_character?(gcs.first)
383+
matched = gcs.take_while { |c| start_with_word ? word_character?(c) : !word_character?(c) && !space_character?(c) }
384+
spaces.sum(&:bytesize) + matched.sum(&:bytesize)
385385
end
386386

387387
def self.common_prefix(list, ignore_case: false)
@@ -399,7 +399,17 @@ def self.common_prefix(list, ignore_case: false)
399399

400400
def self.vi_first_print(line)
401401
gcs = line.grapheme_clusters
402-
spaces = gcs.take_while { |c| c.match?(/\s/) }
402+
spaces = gcs.take_while { |c| space_character?(c) }
403403
spaces.sum(&:bytesize)
404404
end
405+
406+
def self.word_character?(s)
407+
s.encode(Encoding::UTF_8).match?(/\p{Word}/) if s
408+
rescue Encoding::UndefinedConversionError
409+
false
410+
end
411+
412+
def self.space_character?(s)
413+
s.match?(/\s/) if s
414+
end
405415
end

test/reline/test_unicode.rb

Lines changed: 36 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -147,13 +147,15 @@ def test_encoding_conversion
147147

148148
def test_em_forward_word
149149
assert_equal(12, Reline::Unicode.em_forward_word('abc---fooあbar-baz', 3))
150+
assert_equal(11, Reline::Unicode.em_forward_word('abc---fooあbar-baz'.encode('sjis'), 3))
150151
assert_equal(3, Reline::Unicode.em_forward_word('abcfoo', 3))
151152
assert_equal(3, Reline::Unicode.em_forward_word('abc---', 3))
152153
assert_equal(0, Reline::Unicode.em_forward_word('abc', 3))
153154
end
154155

155156
def test_em_forward_word_with_capitalization
156157
assert_equal([12, '---Fooあbar'], Reline::Unicode.em_forward_word_with_capitalization('abc---foOあBar-baz', 3))
158+
assert_equal([11, '---Fooあbar'.encode('sjis')], Reline::Unicode.em_forward_word_with_capitalization('abc---foOあBar-baz'.encode('sjis'), 3))
157159
assert_equal([3, 'Foo'], Reline::Unicode.em_forward_word_with_capitalization('abcfOo', 3))
158160
assert_equal([3, '---'], Reline::Unicode.em_forward_word_with_capitalization('abc---', 3))
159161
assert_equal([0, ''], Reline::Unicode.em_forward_word_with_capitalization('abc', 3))
@@ -162,13 +164,15 @@ def test_em_forward_word_with_capitalization
162164

163165
def test_em_backward_word
164166
assert_equal(12, Reline::Unicode.em_backward_word('abc foo-barあbaz--- xyz', 20))
167+
assert_equal(11, Reline::Unicode.em_backward_word('abc foo-barあbaz--- xyz'.encode('sjis'), 19))
165168
assert_equal(2, Reline::Unicode.em_backward_word(' ', 2))
166169
assert_equal(2, Reline::Unicode.em_backward_word('ab', 2))
167170
assert_equal(0, Reline::Unicode.em_backward_word('ab', 0))
168171
end
169172

170173
def test_em_big_backward_word
171174
assert_equal(16, Reline::Unicode.em_big_backward_word('abc foo-barあbaz--- xyz', 20))
175+
assert_equal(15, Reline::Unicode.em_big_backward_word('abc foo-barあbaz--- xyz'.encode('sjis'), 19))
172176
assert_equal(2, Reline::Unicode.em_big_backward_word(' ', 2))
173177
assert_equal(2, Reline::Unicode.em_big_backward_word('ab', 2))
174178
assert_equal(0, Reline::Unicode.em_big_backward_word('ab', 0))
@@ -184,20 +188,20 @@ def test_ed_transpose_words
184188
assert_equal([3, 5, 6, 8], Reline::Unicode.ed_transpose_words('aa bb cc ', 7))
185189
assert_equal([3, 5, 6, 10], Reline::Unicode.ed_transpose_words('aa bb cc ', 8))
186190
assert_equal([3, 5, 6, 10], Reline::Unicode.ed_transpose_words('aa bb cc ', 9))
187-
word1 = 'fooあ'
188-
word2 = 'barあbaz'
189-
left = 'aaa -'
190-
middle = '- -'
191-
right = '- bbb'
192-
expected = [left.bytesize, (left + word1).bytesize, (left + word1 + middle).bytesize, (left + word1 + middle + word2).bytesize]
193-
assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize))
194-
assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize + middle.bytesize))
195-
assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize + middle.bytesize + word2.bytesize - 1))
191+
['sjis', 'utf-8'].each do |encoding|
192+
texts = ['fooあ', 'barあbaz', 'aaa -', '- -', '- bbb']
193+
word1, word2, left, middle, right = texts.map { |text| text.encode(encoding) }
194+
expected = [left.bytesize, (left + word1).bytesize, (left + word1 + middle).bytesize, (left + word1 + middle + word2).bytesize]
195+
assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize))
196+
assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize + middle.bytesize))
197+
assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize + middle.bytesize + word2.bytesize - 1))
198+
end
196199
end
197200

198201
def test_vi_big_forward_word
199202
assert_equal(18, Reline::Unicode.vi_big_forward_word('abc---fooあbar-baz xyz', 3))
200203
assert_equal(8, Reline::Unicode.vi_big_forward_word('abcfooあ --', 3))
204+
assert_equal(7, Reline::Unicode.vi_big_forward_word('abcfooあ --'.encode('sjis'), 3))
201205
assert_equal(6, Reline::Unicode.vi_big_forward_word('abcfooあ', 3))
202206
assert_equal(3, Reline::Unicode.vi_big_forward_word('abc- ', 3))
203207
assert_equal(0, Reline::Unicode.vi_big_forward_word('abc', 3))
@@ -211,6 +215,7 @@ def test_vi_big_forward_end_word
211215
assert_equal(1, Reline::Unicode.vi_big_forward_end_word('aa b', 0))
212216
assert_equal(3, Reline::Unicode.vi_big_forward_end_word(' aa b', 0))
213217
assert_equal(15, Reline::Unicode.vi_big_forward_end_word('abc---fooあbar-baz xyz', 3))
218+
assert_equal(14, Reline::Unicode.vi_big_forward_end_word('abc---fooあbar-baz xyz'.encode('sjis'), 3))
214219
assert_equal(3, Reline::Unicode.vi_big_forward_end_word('abcfooあ --', 3))
215220
assert_equal(3, Reline::Unicode.vi_big_forward_end_word('abcfooあ', 3))
216221
assert_equal(2, Reline::Unicode.vi_big_forward_end_word('abc- ', 3))
@@ -219,6 +224,7 @@ def test_vi_big_forward_end_word
219224

220225
def test_vi_big_backward_word
221226
assert_equal(16, Reline::Unicode.vi_big_backward_word('abc foo-barあbaz--- xyz', 20))
227+
assert_equal(15, Reline::Unicode.vi_big_backward_word('abc foo-barあbaz--- xyz'.encode('sjis'), 19))
222228
assert_equal(2, Reline::Unicode.vi_big_backward_word(' ', 2))
223229
assert_equal(2, Reline::Unicode.vi_big_backward_word('ab', 2))
224230
assert_equal(0, Reline::Unicode.vi_big_backward_word('ab', 0))
@@ -227,6 +233,7 @@ def test_vi_big_backward_word
227233
def test_vi_forward_word
228234
assert_equal(3, Reline::Unicode.vi_forward_word('abc---fooあbar-baz', 3))
229235
assert_equal(9, Reline::Unicode.vi_forward_word('abc---fooあbar-baz', 6))
236+
assert_equal(8, Reline::Unicode.vi_forward_word('abc---fooあbar-baz'.encode('sjis'), 6))
230237
assert_equal(6, Reline::Unicode.vi_forward_word('abcfooあ', 3))
231238
assert_equal(3, Reline::Unicode.vi_forward_word('abc---', 3))
232239
assert_equal(0, Reline::Unicode.vi_forward_word('abc', 3))
@@ -235,6 +242,7 @@ def test_vi_forward_word
235242
def test_vi_forward_end_word
236243
assert_equal(2, Reline::Unicode.vi_forward_end_word('abc---fooあbar-baz', 3))
237244
assert_equal(8, Reline::Unicode.vi_forward_end_word('abc---fooあbar-baz', 6))
245+
assert_equal(7, Reline::Unicode.vi_forward_end_word('abc---fooあbar-baz'.encode('sjis'), 6))
238246
assert_equal(3, Reline::Unicode.vi_forward_end_word('abcfooあ', 3))
239247
assert_equal(2, Reline::Unicode.vi_forward_end_word('abc---', 3))
240248
assert_equal(0, Reline::Unicode.vi_forward_end_word('abc', 3))
@@ -243,6 +251,7 @@ def test_vi_forward_end_word
243251
def test_vi_backward_word
244252
assert_equal(3, Reline::Unicode.vi_backward_word('abc foo-barあbaz--- xyz', 20))
245253
assert_equal(9, Reline::Unicode.vi_backward_word('abc foo-barあbaz--- xyz', 17))
254+
assert_equal(8, Reline::Unicode.vi_backward_word('abc foo-barあbaz--- xyz'.encode('sjis'), 16))
246255
assert_equal(2, Reline::Unicode.vi_backward_word(' ', 2))
247256
assert_equal(2, Reline::Unicode.vi_backward_word('ab', 2))
248257
assert_equal(0, Reline::Unicode.vi_backward_word('ab', 0))
@@ -252,6 +261,24 @@ def test_vi_first_print
252261
assert_equal(3, Reline::Unicode.vi_first_print(' abcdefg'))
253262
assert_equal(3, Reline::Unicode.vi_first_print(' '))
254263
assert_equal(0, Reline::Unicode.vi_first_print('abc'))
264+
assert_equal(0, Reline::Unicode.vi_first_print('あ'))
265+
assert_equal(0, Reline::Unicode.vi_first_print('あ'.encode('sjis')))
255266
assert_equal(0, Reline::Unicode.vi_first_print(''))
256267
end
268+
269+
def test_character_type
270+
assert(Reline::Unicode.word_character?('a'))
271+
assert(Reline::Unicode.word_character?('あ'))
272+
assert(Reline::Unicode.word_character?('あ'.encode('sjis')))
273+
refute(Reline::Unicode.word_character?(33345.chr('sjis')))
274+
refute(Reline::Unicode.word_character?('-'))
275+
refute(Reline::Unicode.word_character?(nil))
276+
277+
assert(Reline::Unicode.space_character?(' '))
278+
refute(Reline::Unicode.space_character?('あ'))
279+
refute(Reline::Unicode.space_character?('あ'.encode('sjis')))
280+
refute(Reline::Unicode.space_character?(33345.chr('sjis')))
281+
refute(Reline::Unicode.space_character?('-'))
282+
refute(Reline::Unicode.space_character?(nil))
283+
end
257284
end

0 commit comments

Comments
 (0)