Skip to content

Commit b6bd58a

Browse files
committed
diff: Fix inline word diff to treat non-alpha multibyte chars as non-word
Previously, inline word diff simply used Vim's definition of keyword to determine what is a word. This led to multi-byte character classes such as emojis and CJK (Chinese/Japanese/Korean) characters all being classified as word characters, so that entire sentences were grouped as a single word, which does not provide meaningful information in a diff highlight. Fix this by treating all non-alphanumeric characters (with class number above 2) as non-word characters, as there is usually no benefit in using word diff on them. These include CJK characters, emojis, and also subscript/superscript numbers. Meanwhile, multi-byte characters like Cyrillic and Greek letters will continue to be considered as words. Note that this is slightly inconsistent with how words are defined elsewhere, as Vim usually considers any character with class >=2 to be a "word". Related: vim#16881 (diff inline highlight)
1 parent 71f17fd commit b6bd58a

File tree

5 files changed

+40
-5
lines changed

5 files changed

+40
-5
lines changed

runtime/doc/options.txt

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2989,7 +2989,10 @@ A jump table for the options with a short description can be found at |Q_op|.
29892989
difference.
29902990
word Use internal diff to perform a
29912991
|word|-wise diff and highlight the
2992-
difference.
2992+
difference. Non-alphanumeric
2993+
multi-byte characters such as emoji
2994+
and CJK characters are considered
2995+
individual words.
29932996

29942997
internal Use the internal diff library. This is
29952998
ignored when 'diffexpr' is set. *E960*

src/diff.c

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3309,10 +3309,17 @@ diff_find_change_inline_diff(
33093309
char_u *s;
33103310
for (s = curline; *s != NUL;)
33113311
{
3312-
// Always use the first buffer's 'iskeyword' to have a consistent diff
33133312
int new_in_keyword = FALSE;
33143313
if (diff_flags & DIFF_INLINE_WORD)
3315-
new_in_keyword = vim_iswordp_buf(s, curtab->tp_diffbuf[file1_idx]);
3314+
{
3315+
// Always use the first buffer's 'iskeyword' to have a
3316+
// consistent diff.
3317+
// For multibyte chars, only treat alphanumeric chars
3318+
// (class 2) as "word", as other classes such as emojis and
3319+
// CJK ideographs do not usually benefit from word diff as
3320+
// Vim doesn't have a good way to segment them.
3321+
new_in_keyword = (mb_get_class_buf(s, curtab->tp_diffbuf[file1_idx]) == 2);
3322+
}
33163323
if (in_keyword && !new_in_keyword)
33173324
{
33183325
ga_append(curstr, NL);

src/mbyte.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -828,8 +828,8 @@ remove_bom(char_u *s)
828828
* Get class of pointer:
829829
* 0 for blank or NUL
830830
* 1 for punctuation
831-
* 2 for an (ASCII) word character
832-
* >2 for other word characters
831+
* 2 for an alphanumeric word character
832+
* >2 for other word characters, including CJK and emoji
833833
*/
834834
int
835835
mb_get_class(char_u *p)

src/testdir/dumps/Test_diff_inline_word_03.dump

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
| +0#0000e05#a8a8a8255@1|🚀*0#0000000#ffd7ff255|⛵️*2&#ff404010|一*0&#ffd7ff255|二|三*2&#ff404010|ひ*0&#ffd7ff255|ら|が*0&#4040ff13|な*0&#ffd7ff255|Δ+2&#ff404010|έ|λ|τ|α| +0&#ffd7ff255|Δ+2&#ff404010|e|l|t|a| +0&#ffd7ff255|f|o@1|b|a||+1&#ffffff0| +0#0000e05#a8a8a8255@1|🚀*0#0000000#ffd7ff255|🛸*2&#ff404010|一*0&#ffd7ff255|二|四*2&#ff404010|ひ*0&#ffd7ff255|ら|な|δ+2&#ff404010|έ|λ|τ|α| +0&#ffd7ff255|δ+2&#ff404010|e|l|t|a| +0&#ffd7ff255|f|o@1|b|a|r|
2+
|~+0#4040ff13#ffffff0| @35||+1#0000000&|~+0#4040ff13&| @35
3+
|~| @35||+1#0000000&|~+0#4040ff13&| @35
4+
|~| @35||+1#0000000&|~+0#4040ff13&| @35
5+
|~| @35||+1#0000000&|~+0#4040ff13&| @35
6+
|~| @35||+1#0000000&|~+0#4040ff13&| @35
7+
|~| @35||+1#0000000&|~+0#4040ff13&| @35
8+
|~| @35||+1#0000000&|~+0#4040ff13&| @35
9+
|~| @35||+1#0000000&|~+0#4040ff13&| @35
10+
|~| @35||+1#0000000&|~+0#4040ff13&| @35
11+
|~| @35||+1#0000000&|~+0#4040ff13&| @35
12+
|~| @35||+1#0000000&|~+0#4040ff13&| @35
13+
|~| @35||+1#0000000&|~+0#4040ff13&| @35
14+
|~| @35||+1#0000000&|~+0#4040ff13&| @35
15+
|~| @35||+1#0000000&|~+0#4040ff13&| @35
16+
|~| @35||+1#0000000&|~+0#4040ff13&| @35
17+
|~| @35||+1#0000000&|~+0#4040ff13&| @35
18+
|~| @35||+1#0000000&|~+0#4040ff13&| @35
19+
|X+3#0000000&|d|i|f|i|l|e|1| @10|1|,|1| @11|A|l@1| |X+1&&|d|i|f|i|l|e|2| @10|1|,|1| @11|A|l@1
20+
|:+0&&> @73

src/testdir/test_diffmode.vim

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2444,6 +2444,11 @@ func Test_diff_inline()
24442444

24452445
call term_sendkeys(buf, ":windo set iskeyword&\<CR>:1wincmd w\<CR>")
24462446

2447+
" word diff: test handling of multi-byte characters. Only alphanumeric chars
2448+
" (e.g. Greek alphabet, but not CJK/emoji) count as words.
2449+
call WriteDiffFiles(buf, ["🚀⛵️一二三ひらがなΔέλτα Δelta foobar"], ["🚀🛸一二四ひらなδέλτα δelta foobar"])
2450+
call VerifyInternal(buf, "Test_diff_inline_word_03", " diffopt+=inline:word")
2451+
24472452
" char diff: should slide highlight to whitespace boundary if possible for
24482453
" better readability (by using forced indent-heuristics). A wrong result
24492454
" would be if the highlight is "Bar, prefix". It should be "prefixBar, "

0 commit comments

Comments
 (0)