Skip to content

Commit 2f519c3

Browse files
committed
Make Reline::Unicode's vi_, ed_, and em_ methods encoding-safe
1 parent 85f4405 commit 2f519c3

File tree

2 files changed

+87
-50
lines changed

2 files changed

+87
-50
lines changed

lib/reline/unicode.rb

Lines changed: 51 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -269,50 +269,49 @@ def self.get_prev_mbchar_size(line, byte_pointer)
269269

270270
# Returns the number of bytes to move forward from byte_pointer to reach the
# end of the next word: the leading run of non-word grapheme clusters plus the
# run of word clusters that follows it. Returns 0 when no text remains.
#
# Diff note: the removed (-) lines classify each cluster by transcoding it to
# UTF-8 and matching a regexp inline; the added (+) lines delegate the same
# decision to word_character?.
def self.em_forward_word(line, byte_pointer)
271271
gcs = line.byteslice(byte_pointer..).grapheme_clusters
272-
nonwords = gcs.take_while { |c| c.encode(Encoding::UTF_8).match?(/\P{Word}/) }
273-
words = gcs.drop(nonwords.size).take_while { |c| c.encode(Encoding::UTF_8).match?(/\p{Word}/) }
272+
nonwords = gcs.take_while { |c| !word_character?(c) }
273+
words = gcs.drop(nonwords.size).take_while { |c| word_character?(c) }
274274
# Sizes are summed in bytes of the line's own encoding, not in characters.
nonwords.sum(&:bytesize) + words.sum(&:bytesize)
275275
end
276276

277277
# Like em_forward_word, but also returns the replacement text for the span.
#
# Returns a two-element array: the byte size of the leading non-word run plus
# the following word run, and a string made of the non-word run unchanged
# followed by the word run passed through String#capitalize.
#
# Diff note: the removed (-) lines match a UTF-8 regexp inline; the added (+)
# lines use word_character? instead.
def self.em_forward_word_with_capitalization(line, byte_pointer)
278278
gcs = line.byteslice(byte_pointer..).grapheme_clusters
279-
nonwords = gcs.take_while { |c| c.encode(Encoding::UTF_8).match?(/\P{Word}/) }
280-
words = gcs.drop(nonwords.size).take_while { |c| c.encode(Encoding::UTF_8).match?(/\p{Word}/) }
279+
nonwords = gcs.take_while { |c| !word_character?(c) }
280+
words = gcs.drop(nonwords.size).take_while { |c| word_character?(c) }
281281
[nonwords.sum(&:bytesize) + words.sum(&:bytesize), nonwords.join + words.join.capitalize]
282282
end
283283

284284
# Returns the number of bytes to move backward from byte_pointer to reach the
# start of the previous word. The text before byte_pointer is split into
# grapheme clusters and walked in reverse: first the run of non-word clusters
# adjacent to the cursor, then the run of word clusters before it.
#
# Diff note: the removed (-) lines match a UTF-8 regexp inline; the added (+)
# lines use word_character? instead.
def self.em_backward_word(line, byte_pointer)
285285
gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse
286-
nonwords = gcs.take_while { |c| c.encode(Encoding::UTF_8).match?(/\P{Word}/) }
287-
words = gcs.drop(nonwords.size).take_while { |c| c.encode(Encoding::UTF_8).match?(/\p{Word}/) }
286+
nonwords = gcs.take_while { |c| !word_character?(c) }
287+
words = gcs.drop(nonwords.size).take_while { |c| word_character?(c) }
288288
nonwords.sum(&:bytesize) + words.sum(&:bytesize)
289289
end
290290

291291
# Returns the number of bytes to move backward from byte_pointer over a
# whitespace-delimited word: the run of whitespace clusters adjacent to the
# cursor plus the run of non-whitespace clusters before it.
#
# Diff note: the removed (-) lines match /\s/ and /\S/ inline; the added (+)
# lines use space_character? and its negation.
def self.em_big_backward_word(line, byte_pointer)
292292
gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse
293-
spaces = gcs.take_while { |c| c.match?(/\s/) }
294-
nonspaces = gcs.drop(spaces.size).take_while { |c| c.match?(/\S/) }
293+
spaces = gcs.take_while { |c| space_character?(c) }
294+
nonspaces = gcs.drop(spaces.size).take_while { |c| !space_character?(c) }
295295
spaces.sum(&:bytesize) + nonspaces.sum(&:bytesize)
296296
end
297297

298298
def self.ed_transpose_words(line, byte_pointer)
299299
gcs = line.byteslice(0, byte_pointer).grapheme_clusters
300300
pos = gcs.size
301301
gcs += line.byteslice(byte_pointer..).grapheme_clusters
302-
gcs.map! { |c| c.encode(Encoding::UTF_8) }
303-
pos += 1 while pos < gcs.size && gcs[pos].match?(/\P{Word}/)
302+
pos += 1 while pos < gcs.size && !word_character?(gcs[pos])
304303
if pos == gcs.size # 'aaa bbb [cursor] '
305-
pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\P{Word}/)
304+
pos -= 1 while pos > 0 && !word_character?(gcs[pos - 1])
306305
second_word_end = gcs.size
307306
else # 'aaa [cursor]bbb'
308-
pos += 1 while pos < gcs.size && gcs[pos].match?(/\p{Word}/)
307+
pos += 1 while pos < gcs.size && word_character?(gcs[pos])
309308
second_word_end = pos
310309
end
311-
pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\p{Word}/)
310+
pos -= 1 while pos > 0 && word_character?(gcs[pos - 1])
312311
second_word_start = pos
313-
pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\P{Word}/)
312+
pos -= 1 while pos > 0 && !word_character?(gcs[pos - 1])
314313
first_word_end = pos
315-
pos -= 1 while pos > 0 && gcs[pos - 1].match?(/\p{Word}/)
314+
pos -= 1 while pos > 0 && word_character?(gcs[pos - 1])
316315
first_word_start = pos
317316

318317
[first_word_start, first_word_end, second_word_start, second_word_end].map do |idx|
@@ -322,77 +321,88 @@ def self.ed_transpose_words(line, byte_pointer)
322321

323322
# Returns the number of bytes to move forward from byte_pointer past the
# current whitespace-delimited word: the leading run of non-whitespace
# clusters plus the run of whitespace clusters that follows it.
#
# Diff note: the removed (-) lines match /\S/ and /\s/ inline; the added (+)
# lines use space_character? and its negation.
def self.vi_big_forward_word(line, byte_pointer)
324323
gcs = line.byteslice(byte_pointer..).grapheme_clusters
325-
nonspaces = gcs.take_while { |c| c.match?(/\S/) }
326-
spaces = gcs.drop(nonspaces.size).take_while { |c| c.match?(/\s/) }
324+
nonspaces = gcs.take_while { |c| !space_character?(c) }
325+
spaces = gcs.drop(nonspaces.size).take_while { |c| space_character?(c) }
327326
nonspaces.sum(&:bytesize) + spaces.sum(&:bytesize)
328327
end
329328

330329
# Returns the number of bytes to move forward from byte_pointer so that the
# cursor lands on the last cluster of a whitespace-delimited word.
#
# The cluster under the cursor is always skipped. After it, a run of
# whitespace and the following run of non-whitespace are collected, and the
# final collected cluster is dropped so the result stops on it rather than
# after it.
#
# Diff note: the removed (-) lines match /\s/ and /\S/ inline; the added (+)
# lines use space_character? and its negation.
def self.vi_big_forward_end_word(line, byte_pointer)
331330
gcs = line.byteslice(byte_pointer..).grapheme_clusters
332331
# shift(1) returns an array (empty when no text remains), so sum below is safe.
first = gcs.shift(1)
333-
spaces = gcs.take_while { |c| c.match?(/\s/) }
334-
nonspaces = gcs.drop(spaces.size).take_while { |c| c.match?(/\S/) }
332+
spaces = gcs.take_while { |c| space_character?(c) }
333+
nonspaces = gcs.drop(spaces.size).take_while { |c| !space_character?(c) }
335334
matched = spaces + nonspaces
336335
# Drop the last matched cluster: the cursor should rest on it, not pass it.
matched.pop
337336
first.sum(&:bytesize) + matched.sum(&:bytesize)
338337
end
339338

340339
# Returns the number of bytes to move backward from byte_pointer to the start
# of the previous whitespace-delimited word: the run of whitespace clusters
# adjacent to the cursor plus the run of non-whitespace clusters before it.
#
# Diff note: the removed (-) lines match /\s/ and /\S/ inline; the added (+)
# lines use space_character? and its negation.
def self.vi_big_backward_word(line, byte_pointer)
341340
gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse
342-
spaces = gcs.take_while { |c| c.match?(/\s/) }
343-
nonspaces = gcs.drop(spaces.size).take_while { |c| c.match?(/\S/) }
341+
spaces = gcs.take_while { |c| space_character?(c) }
342+
nonspaces = gcs.drop(spaces.size).take_while { |c| !space_character?(c) }
344343
spaces.sum(&:bytesize) + nonspaces.sum(&:bytesize)
345344
end
346345

347346
# Returns the number of bytes to move forward from byte_pointer to the start
# of the next word, or 0 when no text remains.
#
# The first cluster decides which class of run is consumed: word characters,
# whitespace, or "other" (neither word nor whitespace). The whole leading run
# of that class is matched. When drop_terminate_spaces is true only that run
# is counted; otherwise the whitespace run that follows it is counted too.
#
# Diff note: the removed (-) lines transcode every cluster to UTF-8 up front
# and pick a regexp for the run; the added (+) lines keep the clusters in the
# line's encoding and classify them with word_character? / space_character?.
def self.vi_forward_word(line, byte_pointer, drop_terminate_spaces = false)
348-
gcs = line.byteslice(byte_pointer..).grapheme_clusters.map { |c| c.encode(Encoding::UTF_8) }
347+
gcs = line.byteslice(byte_pointer..).grapheme_clusters
349348
return 0 if gcs.empty?
350349

351-
regexp =
352-
case gcs.first
353-
when /\p{Word}/
354-
/\p{Word}/
355-
when /\s/
356-
/\s/
350+
c = gcs.first
351+
matched =
352+
if word_character?(c)
353+
gcs.take_while { |c| word_character?(c) }
354+
elsif space_character?(c)
355+
gcs.take_while { |c| space_character?(c) }
357356
else
358-
/[^\p{Word}\s]/
357+
gcs.take_while { |c| !word_character?(c) && !space_character?(c) }
359358
end
360-
matched = gcs.take_while { |c| c.match?(regexp) }
359+
361360
return matched.sum(&:bytesize) if drop_terminate_spaces
362361

363-
spaces = gcs.drop(matched.size).take_while { |c| c.match?(/\s/) }
362+
spaces = gcs.drop(matched.size).take_while { |c| space_character?(c) }
364363
matched.sum(&:bytesize) + spaces.sum(&:bytesize)
365364
end
366365

367366
# Returns the number of bytes to move forward from byte_pointer so that the
# cursor lands on the last cluster of a word.
#
# Returns 0 when no text remains, and the byte size of the only cluster when
# exactly one remains. Otherwise the cluster under the cursor is skipped; if
# it or the cluster after it is whitespace, the following whitespace run is
# skipped as well. The run that comes next (word characters, or characters
# that are neither word nor whitespace, depending on its first cluster) is
# matched and its final cluster is dropped so the result stops on it.
#
# Diff note: the removed (-) lines transcode every cluster to UTF-8 and pick a
# regexp for the run; the added (+) lines classify clusters with
# word_character? / space_character?, which also accept nil for gcs.first.
def self.vi_forward_end_word(line, byte_pointer)
368-
gcs = line.byteslice(byte_pointer..).grapheme_clusters.map { |c| c.encode(Encoding::UTF_8) }
367+
gcs = line.byteslice(byte_pointer..).grapheme_clusters
369368
return 0 if gcs.empty?
370369
return gcs.first.bytesize if gcs.size == 1
371370

372371
start = gcs.shift
373372
skips = [start]
374-
if start.match?(/\s/) || gcs.first.match?(/\s/)
375-
spaces = gcs.take_while { |c| c.match?(/\s/) }
373+
if space_character?(start) || space_character?(gcs.first)
374+
spaces = gcs.take_while { |c| space_character?(c) }
376375
skips += spaces
377376
gcs.shift(spaces.size)
378377
end
379-
regexp = /\p{Word}/.match?(gcs.first) ? /\p{Word}/ : /[^\p{Word}\s]/
380-
matched = gcs.take_while { |c| c.match?(regexp) }
378+
start_with_word = word_character?(gcs.first)
379+
matched = gcs.take_while { |c| start_with_word ? word_character?(c) : !word_character?(c) && !space_character?(c) }
381380
# Drop the last matched cluster: the cursor should rest on it, not pass it.
matched.pop
382381
skips.sum(&:bytesize) + matched.sum(&:bytesize)
383382
end
384383

385384
# Returns the number of bytes to move backward from byte_pointer to the start
# of the previous word.
#
# Walking the clusters before byte_pointer in reverse, the whitespace run
# adjacent to the cursor is counted first. The next run is then counted: word
# characters if the first remaining cluster is a word character, otherwise
# characters that are neither word nor whitespace.
#
# Diff note: the removed (-) lines transcode every cluster to UTF-8 and pick a
# regexp for the run; the added (+) lines classify clusters with
# word_character? / space_character?.
def self.vi_backward_word(line, byte_pointer)
386-
gcs = line.byteslice(0, byte_pointer).grapheme_clusters.map { |c| c.encode(Encoding::UTF_8) }.reverse
387-
spaces = gcs.take_while { |c| c.match?(/\s/) }
385+
gcs = line.byteslice(0, byte_pointer).grapheme_clusters.reverse
386+
spaces = gcs.take_while { |c| space_character?(c) }
388387
gcs.shift(spaces.size)
389-
regexp = /\p{Word}/.match?(gcs.first) ? /\p{Word}/ : /[^\p{Word}\s]/
390-
spaces.sum(&:bytesize) + gcs.take_while { |c| c.match?(regexp) }.sum(&:bytesize)
388+
start_with_word = word_character?(gcs.first)
389+
matched = gcs.take_while { |c| start_with_word ? word_character?(c) : !word_character?(c) && !space_character?(c) }
390+
spaces.sum(&:bytesize) + matched.sum(&:bytesize)
391391
end
392392

393393
# Returns the byte size of the leading whitespace of line, i.e. the byte
# offset of its first non-whitespace grapheme cluster. When the line is empty
# the result is 0; when it is all whitespace the result is its full byte size.
#
# Diff note: the removed (-) line matches /\s/ inline; the added (+) line uses
# space_character?.
def self.vi_first_print(line)
394394
gcs = line.grapheme_clusters
395-
spaces = gcs.take_while { |c| c.match?(/\s/) }
395+
spaces = gcs.take_while { |c| space_character?(c) }
396396
spaces.sum(&:bytesize)
397397
end
398+
399+
# Reports whether the grapheme cluster +s+ counts as a word character.
#
# The cluster is transcoded to UTF-8 so that the Unicode property
# <tt>\p{Word}</tt> can be tested whatever encoding the line uses.
#
# s:: a single grapheme cluster (String) in any encoding, or +nil+.
#
# Returns +true+ when the UTF-8 form of +s+ matches <tt>\p{Word}</tt>.
# Returns +false+ for +nil+, for a cluster whose bytes are invalid in its own
# encoding, and for a cluster that cannot be transcoded to UTF-8. It never
# raises for those inputs.
def self.word_character?(s)
  # A broken byte sequence cannot be classified; answer false instead of
  # letting the transcoder or the regexp engine raise.
  return false unless s&.valid_encoding?

  s.encode(Encoding::UTF_8).match?(/\p{Word}/)
rescue EncodingError
  # EncodingError is the common parent of UndefinedConversionError (no UTF-8
  # mapping), InvalidByteSequenceError and ConverterNotFoundError.
  false
end
404+
405+
# Reports whether the grapheme cluster +s+ is whitespace, i.e. matches
# <tt>\s</tt>.
#
# s:: a single grapheme cluster (String) in any ASCII-compatible encoding, or
#     +nil+.
#
# Returns +true+ when +s+ contains a whitespace character. Returns +false+ for
# +nil+ and for a cluster whose bytes are invalid in its own encoding.
def self.space_character?(s)
  # Matching a regexp against a string with an invalid byte sequence can raise
  # ArgumentError, so such clusters are reported as non-space up front.
  return false unless s&.valid_encoding?

  s.match?(/\s/)
end
398408
end

test/reline/test_unicode.rb

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -92,13 +92,15 @@ def test_take_mbchar_range
9292

9393
# Checks em_forward_word from byte offset 3: across punctuation into a word
# that contains a multibyte character, on a plain word, on trailing
# punctuation only, and at the end of the line (expects 0).
# The added (+) assertion repeats the first case with the line encoded as
# Shift_JIS; its expected size is one byte smaller, consistent with 'あ'
# taking fewer bytes in that encoding.
def test_em_forward_word
9494
assert_equal(12, Reline::Unicode.em_forward_word('abc---fooあbar-baz', 3))
95+
assert_equal(11, Reline::Unicode.em_forward_word('abc---fooあbar-baz'.encode('sjis'), 3))
9596
assert_equal(3, Reline::Unicode.em_forward_word('abcfoo', 3))
9697
assert_equal(3, Reline::Unicode.em_forward_word('abc---', 3))
9798
assert_equal(0, Reline::Unicode.em_forward_word('abc', 3))
9899
end
99100

100101
def test_em_forward_word_with_capitalization
101102
assert_equal([12, '---Fooあbar'], Reline::Unicode.em_forward_word_with_capitalization('abc---foOあBar-baz', 3))
103+
assert_equal([11, '---Fooあbar'.encode('sjis')], Reline::Unicode.em_forward_word_with_capitalization('abc---foOあBar-baz'.encode('sjis'), 3))
102104
assert_equal([3, 'Foo'], Reline::Unicode.em_forward_word_with_capitalization('abcfOo', 3))
103105
assert_equal([3, '---'], Reline::Unicode.em_forward_word_with_capitalization('abc---', 3))
104106
assert_equal([0, ''], Reline::Unicode.em_forward_word_with_capitalization('abc', 3))
@@ -107,13 +109,15 @@ def test_em_forward_word_with_capitalization
107109

108110
# Checks em_backward_word: moving back over punctuation and a word containing
# a multibyte character, over whitespace only, over a plain word, and from
# byte offset 0 (expects 0).
# The added (+) assertion repeats the first case with the line encoded as
# Shift_JIS; both the byte offset passed in and the expected size are one
# byte smaller than in the UTF-8 case.
def test_em_backward_word
109111
assert_equal(12, Reline::Unicode.em_backward_word('abc foo-barあbaz--- xyz', 20))
112+
assert_equal(11, Reline::Unicode.em_backward_word('abc foo-barあbaz--- xyz'.encode('sjis'), 19))
110113
assert_equal(2, Reline::Unicode.em_backward_word(' ', 2))
111114
assert_equal(2, Reline::Unicode.em_backward_word('ab', 2))
112115
assert_equal(0, Reline::Unicode.em_backward_word('ab', 0))
113116
end
114117

115118
def test_em_big_backward_word
116119
assert_equal(16, Reline::Unicode.em_big_backward_word('abc foo-barあbaz--- xyz', 20))
120+
assert_equal(15, Reline::Unicode.em_big_backward_word('abc foo-barあbaz--- xyz'.encode('sjis'), 19))
117121
assert_equal(2, Reline::Unicode.em_big_backward_word(' ', 2))
118122
assert_equal(2, Reline::Unicode.em_big_backward_word('ab', 2))
119123
assert_equal(0, Reline::Unicode.em_big_backward_word('ab', 0))
@@ -129,20 +133,20 @@ def test_ed_transpose_words
129133
assert_equal([3, 5, 6, 8], Reline::Unicode.ed_transpose_words('aa bb cc ', 7))
130134
assert_equal([3, 5, 6, 10], Reline::Unicode.ed_transpose_words('aa bb cc ', 8))
131135
assert_equal([3, 5, 6, 10], Reline::Unicode.ed_transpose_words('aa bb cc ', 9))
132-
word1 = 'fooあ'
133-
word2 = 'barあbaz'
134-
left = 'aaa -'
135-
middle = '- -'
136-
right = '- bbb'
137-
expected = [left.bytesize, (left + word1).bytesize, (left + word1 + middle).bytesize, (left + word1 + middle + word2).bytesize]
138-
assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize))
139-
assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize + middle.bytesize))
140-
assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize + middle.bytesize + word2.bytesize - 1))
136+
['sjis', 'utf-8'].each do |encoding|
137+
texts = ['fooあ', 'barあbaz', 'aaa -', '- -', '- bbb']
138+
word1, word2, left, middle, right = texts.map { |text| text.encode(encoding) }
139+
expected = [left.bytesize, (left + word1).bytesize, (left + word1 + middle).bytesize, (left + word1 + middle + word2).bytesize]
140+
assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize))
141+
assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize + middle.bytesize))
142+
assert_equal(expected, Reline::Unicode.ed_transpose_words(left + word1 + middle + word2 + right, left.bytesize + word1.bytesize + middle.bytesize + word2.bytesize - 1))
143+
end
141144
end
142145

143146
def test_vi_big_forward_word
144147
assert_equal(18, Reline::Unicode.vi_big_forward_word('abc---fooあbar-baz xyz', 3))
145148
assert_equal(8, Reline::Unicode.vi_big_forward_word('abcfooあ --', 3))
149+
assert_equal(7, Reline::Unicode.vi_big_forward_word('abcfooあ --'.encode('sjis'), 3))
146150
assert_equal(6, Reline::Unicode.vi_big_forward_word('abcfooあ', 3))
147151
assert_equal(3, Reline::Unicode.vi_big_forward_word('abc- ', 3))
148152
assert_equal(0, Reline::Unicode.vi_big_forward_word('abc', 3))
@@ -156,6 +160,7 @@ def test_vi_big_forward_end_word
156160
assert_equal(1, Reline::Unicode.vi_big_forward_end_word('aa b', 0))
157161
assert_equal(3, Reline::Unicode.vi_big_forward_end_word(' aa b', 0))
158162
assert_equal(15, Reline::Unicode.vi_big_forward_end_word('abc---fooあbar-baz xyz', 3))
163+
assert_equal(14, Reline::Unicode.vi_big_forward_end_word('abc---fooあbar-baz xyz'.encode('sjis'), 3))
159164
assert_equal(3, Reline::Unicode.vi_big_forward_end_word('abcfooあ --', 3))
160165
assert_equal(3, Reline::Unicode.vi_big_forward_end_word('abcfooあ', 3))
161166
assert_equal(2, Reline::Unicode.vi_big_forward_end_word('abc- ', 3))
@@ -164,6 +169,7 @@ def test_vi_big_forward_end_word
164169

165170
def test_vi_big_backward_word
166171
assert_equal(16, Reline::Unicode.vi_big_backward_word('abc foo-barあbaz--- xyz', 20))
172+
assert_equal(15, Reline::Unicode.vi_big_backward_word('abc foo-barあbaz--- xyz'.encode('sjis'), 19))
167173
assert_equal(2, Reline::Unicode.vi_big_backward_word(' ', 2))
168174
assert_equal(2, Reline::Unicode.vi_big_backward_word('ab', 2))
169175
assert_equal(0, Reline::Unicode.vi_big_backward_word('ab', 0))
@@ -172,6 +178,7 @@ def test_vi_big_backward_word
172178
def test_vi_forward_word
173179
assert_equal(3, Reline::Unicode.vi_forward_word('abc---fooあbar-baz', 3))
174180
assert_equal(9, Reline::Unicode.vi_forward_word('abc---fooあbar-baz', 6))
181+
assert_equal(8, Reline::Unicode.vi_forward_word('abc---fooあbar-baz'.encode('sjis'), 6))
175182
assert_equal(6, Reline::Unicode.vi_forward_word('abcfooあ', 3))
176183
assert_equal(3, Reline::Unicode.vi_forward_word('abc---', 3))
177184
assert_equal(0, Reline::Unicode.vi_forward_word('abc', 3))
@@ -180,6 +187,7 @@ def test_vi_forward_word
180187
def test_vi_forward_end_word
181188
assert_equal(2, Reline::Unicode.vi_forward_end_word('abc---fooあbar-baz', 3))
182189
assert_equal(8, Reline::Unicode.vi_forward_end_word('abc---fooあbar-baz', 6))
190+
assert_equal(7, Reline::Unicode.vi_forward_end_word('abc---fooあbar-baz'.encode('sjis'), 6))
183191
assert_equal(3, Reline::Unicode.vi_forward_end_word('abcfooあ', 3))
184192
assert_equal(2, Reline::Unicode.vi_forward_end_word('abc---', 3))
185193
assert_equal(0, Reline::Unicode.vi_forward_end_word('abc', 3))
@@ -188,6 +196,7 @@ def test_vi_forward_end_word
188196
def test_vi_backward_word
189197
assert_equal(3, Reline::Unicode.vi_backward_word('abc foo-barあbaz--- xyz', 20))
190198
assert_equal(9, Reline::Unicode.vi_backward_word('abc foo-barあbaz--- xyz', 17))
199+
assert_equal(8, Reline::Unicode.vi_backward_word('abc foo-barあbaz--- xyz'.encode('sjis'), 16))
191200
assert_equal(2, Reline::Unicode.vi_backward_word(' ', 2))
192201
assert_equal(2, Reline::Unicode.vi_backward_word('ab', 2))
193202
assert_equal(0, Reline::Unicode.vi_backward_word('ab', 0))
@@ -197,6 +206,24 @@ def test_vi_first_print
197206
assert_equal(3, Reline::Unicode.vi_first_print(' abcdefg'))
198207
assert_equal(3, Reline::Unicode.vi_first_print(' '))
199208
assert_equal(0, Reline::Unicode.vi_first_print('abc'))
209+
assert_equal(0, Reline::Unicode.vi_first_print('あ'))
210+
assert_equal(0, Reline::Unicode.vi_first_print('あ'.encode('sjis')))
200211
assert_equal(0, Reline::Unicode.vi_first_print(''))
201212
end
213+
214+
# Checks the two classification helpers added by this commit,
# word_character? and space_character?, against ASCII text, a multibyte
# character in UTF-8 and in Shift_JIS, punctuation, and nil.
# NOTE(review): 33345.chr('sjis') is presumably a Shift_JIS character with no
# UTF-8 mapping, exercising the rescue branch of word_character? - confirm.
def test_character_type
215+
assert(Reline::Unicode.word_character?('a'))
216+
assert(Reline::Unicode.word_character?('あ'))
217+
assert(Reline::Unicode.word_character?('あ'.encode('sjis')))
218+
refute(Reline::Unicode.word_character?(33345.chr('sjis')))
219+
refute(Reline::Unicode.word_character?('-'))
220+
refute(Reline::Unicode.word_character?(nil))
221+
222+
assert(Reline::Unicode.space_character?(' '))
223+
refute(Reline::Unicode.space_character?('あ'))
224+
refute(Reline::Unicode.space_character?('あ'.encode('sjis')))
225+
refute(Reline::Unicode.space_character?(33345.chr('sjis')))
226+
refute(Reline::Unicode.space_character?('-'))
227+
refute(Reline::Unicode.space_character?(nil))
228+
end
202229
end

0 commit comments

Comments
 (0)