# git/t/lib-unicode-nfc-nfd.sh

# Help detect how Unicode NFC and NFD are handled on the filesystem.

# A simple character that has an NFD form.
#
# NFC:       U+00e9 LATIN SMALL LETTER E WITH ACUTE
# UTF8(NFC): \xc3 \xa9
#
# NFD:       U+0065 LATIN SMALL LETTER E
#            U+0301 COMBINING ACUTE ACCENT
# UTF8(NFD): \x65  +  \xcc \x81
#
# The bytes are spelled with octal escapes because POSIX printf only
# guarantees the "\ddd" form; "\xHH" is an extension that not every
# shell implements (e.g. dash prints it literally).
#
#   \303 \251       == \xc3 \xa9       (NFC)
#   \145 \314 \201  == \x65 \xcc \x81  (NFD)
#
utf8_nfc=$(printf "\303\251")
utf8_nfd=$(printf "\145\314\201")

# Is the OS or the filesystem "Unicode composition sensitive"?
#
# That is, does the OS or the filesystem allow files to exist with
# both the NFC and NFD spellings?  Or, does the OS/FS lie to us and
# tell us that the NFC and NFD forms are equivalent?
#
# This is or may be independent of what type of filesystem we have,
# since it might be handled by the OS at a layer above the FS.
# Testing shows that MacOS reports a collision on APFS, HFS+, and
# FAT32, for example.
#
# This does not tell us how the Unicode pathname will be spelled
# on disk, but rather only that the two spellings "collide".  We
# will examine the actual on disk spelling in a later prereq.
#
# The script below succeeds only when both mkdir calls succeed, i.e.
# creating the NFD-spelled directory is not refused after the
# NFC-spelled one has been created.
test_lazy_prereq UNICODE_COMPOSITION_SENSITIVE '
	mkdir "trial_$utf8_nfc" &&
	mkdir "trial_$utf8_nfd"
'

# Is the spelling of an NFC pathname preserved on disk?
#
# On MacOS with HFS+ and FAT32, NFC paths are converted into NFD
# and on APFS, NFC paths are preserved.  As we have established
# above, this is independent of "composition sensitivity".
#
# Create a directory named "c_" + NFC, hexdump the directory listing,
# and look for the bytes of "c_" (63 5f) immediately followed by the
# NFC bytes (c3 a9).
test_lazy_prereq UNICODE_NFC_PRESERVED '
	mkdir "c_$utf8_nfc" &&
	ls | test-tool hexdump >dump &&
	grep "63 5f c3 a9" <dump
'

# Is the spelling of an NFD pathname preserved on disk?
#
# Create a directory named "d_" + NFD, hexdump the directory listing,
# and look for the bytes of "d_" (64 5f) immediately followed by the
# NFD bytes (65 cc 81).
test_lazy_prereq UNICODE_NFD_PRESERVED '
	mkdir "d_$utf8_nfd" &&
	ls | test-tool hexdump >dump &&
	grep "64 5f 65 cc 81" <dump
'

# The following _DOUBLE_ forms are more for my curiosity,
# but there may be quirks lurking when there are multiple
# combining characters in non-canonical order.

# Unicode also allows multiple combining characters
# that can be decomposed in pieces.
#
# NFC:        U+1f67 GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI
# UTF8(NFC):  \xe1 \xbd \xa7
#
# NFD1:       U+1f61 GREEK SMALL LETTER OMEGA WITH DASIA
#             U+0342 COMBINING GREEK PERISPOMENI
# UTF8(NFD1): \xe1 \xbd \xa1  +  \xcd \x82
#
# But U+1f61 decomposes into
# NFD2:       U+03c9 GREEK SMALL LETTER OMEGA
#             U+0314 COMBINING REVERSED COMMA ABOVE
# UTF8(NFD2): \xcf \x89  +  \xcc \x94
#
# Yielding:   \xcf \x89  +  \xcc \x94  +  \xcd \x82
#
# Note that I've used the canonical ordering of the
# combining characters.  It is also possible to
# swap them.  My testing shows that the non-standard
# ordering also causes a collision in mkdir.  However,
# the resulting names don't draw correctly on the
# terminal (implying that the on-disk format also has
# them out of order).
#
# The bytes are spelled with octal escapes because POSIX printf only
# guarantees the "\ddd" form; "\xHH" is an extension that not every
# shell implements (e.g. dash prints it literally).
#
#   \341 \275 \247                 == \xe1 \xbd \xa7            (NFC)
#   \341 \275 \241 \315 \202       == \xe1 \xbd \xa1 \xcd \x82  (NFD1)
#   \317 \211 \314 \224 \315 \202  == \xcf \x89 \xcc \x94 \xcd \x82
#                                                               (fully decomposed)
#
greek_nfc=$(printf "\341\275\247")
greek_nfd1=$(printf "\341\275\241\315\202")
greek_nfd2=$(printf "\317\211\314\224\315\202")

# See if a double decomposition also collides.
#
# The script below succeeds only when both mkdir calls succeed, i.e.
# creating the fully decomposed spelling is not refused after the
# composed one has been created.
test_lazy_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE '
	mkdir "trial_$greek_nfc" &&
	mkdir "trial_$greek_nfd2"
'

# See if the NFC spelling appears on the disk.
#
# Create a directory named "c_" + the composed Greek character,
# hexdump the directory listing, and look for the bytes of "c_"
# (63 5f) immediately followed by the NFC bytes (e1 bd a7).
test_lazy_prereq UNICODE_DOUBLE_NFC_PRESERVED '
	mkdir "c_$greek_nfc" &&
	ls | test-tool hexdump >dump &&
	grep "63 5f e1 bd a7" <dump
'

# See if the NFD spelling appears on the disk.
#
# Create a directory named "d_" + the fully decomposed Greek
# sequence, hexdump the directory listing, and look for the bytes of
# "d_" (64 5f) immediately followed by cf 89 cc 94 cd 82.
test_lazy_prereq UNICODE_DOUBLE_NFD_PRESERVED '
	mkdir "d_$greek_nfd2" &&
	ls | test-tool hexdump >dump &&
	grep "64 5f cf 89 cc 94 cd 82" <dump
'

# The following is for debugging. I found it useful when
# trying to understand the various (OS, FS) quirks WRT
# Unicode and how composition/decomposition is handled.
# For example, when trying to understand how (macOS, APFS)
# and (macOS, HFS) and (macOS, FAT32) compare.
#
# It is rather noisy, so it is disabled by default.
#
# Opt-in diagnostics: nothing below runs unless unicode_debug is
# exactly "true".  Each prereq is queried in turn, in the order they
# are declared in this file, and a one-line summary is echoed.
case "$unicode_debug" in
true)
	if test_have_prereq UNICODE_COMPOSITION_SENSITIVE
	then
		echo "NFC and NFD are distinct on this OS/filesystem."
	else
		echo "NFC and NFD are aliases on this OS/filesystem."
	fi

	if test_have_prereq UNICODE_NFC_PRESERVED
	then
		echo "NFC maintains original spelling."
	else
		echo "NFC is modified."
	fi

	if test_have_prereq UNICODE_NFD_PRESERVED
	then
		echo "NFD maintains original spelling."
	else
		echo "NFD is modified."
	fi

	if test_have_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE
	then
		echo "DOUBLE NFC and NFD are distinct on this OS/filesystem."
	else
		echo "DOUBLE NFC and NFD are aliases on this OS/filesystem."
	fi

	if test_have_prereq UNICODE_DOUBLE_NFC_PRESERVED
	then
		echo "Double NFC maintains original spelling."
	else
		echo "Double NFC is modified."
	fi

	if test_have_prereq UNICODE_DOUBLE_NFD_PRESERVED
	then
		echo "Double NFD maintains original spelling."
	else
		echo "Double NFD is modified."
	fi
	;;
esac
t/lib-unicode-nfc-nfd: helper prereqs for testing unicode nfc/nfd Create a set of prereqs to help understand how file names are handled by the filesystem when they contain NFC and NFD Unicode characters. Signed-off-by: Jeff Hostetler <jeffhost@microsoft.com> Signed-off-by: Junio C Hamano <gitster@pobox.com> 2 years ago			`# Help detect how Unicode NFC and NFD are handled on the filesystem.`

			`# A simple character that has a NFD form.`
			`#`
			`# NFC: U+00e9 LATIN SMALL LETTER E WITH ACUTE`
			`# UTF8(NFC): \xc3 \xa9`
			`#`
			`# NFD: U+0065 LATIN SMALL LETTER E`
			`# U+0301 COMBINING ACUTE ACCENT`
			`# UTF8(NFD): \x65 + \xcc \x81`
			`#`
			`utf8_nfc=$(printf "\xc3\xa9")`
			`utf8_nfd=$(printf "\x65\xcc\x81")`

			`# Is the OS or the filesystem "Unicode composition sensitive"?`
			`#`
			`# That is, does the OS or the filesystem allow files to exist with`
			`# both the NFC and NFD spellings? Or, does the OS/FS lie to us and`
			`# tell us that the NFC and NFD forms are equivalent.`
			`#`
			`# This is or may be independent of what type of filesystem we have,`
			`# since it might be handled by the OS at a layer above the FS.`
			`# Testing shows on MacOS using APFS, HFS+, and FAT32 reports a`
			`# collision, for example.`
			`#`
			`# This does not tell us how the Unicode pathname will be spelled`
			`# on disk, but rather only that the two spelling "collide". We`
			`# will examine the actual on disk spelling in a later prereq.`
			`#`
			`test_lazy_prereq UNICODE_COMPOSITION_SENSITIVE '`
			`mkdir trial_${utf8_nfc} &&`
			`mkdir trial_${utf8_nfd}`
			`'`

			`# Is the spelling of an NFC pathname preserved on disk?`
			`#`
			`# On MacOS with HFS+ and FAT32, NFC paths are converted into NFD`
			`# and on APFS, NFC paths are preserved. As we have established`
			`# above, this is independent of "composition sensitivity".`
			`#`
			`test_lazy_prereq UNICODE_NFC_PRESERVED '`
			`mkdir c_${utf8_nfc} &&`
			`ls \| test-tool hexdump >dump &&`
			`grep "63 5f c3 a9" dump`
			`'`

			`# Is the spelling of an NFD pathname preserved on disk?`
			`#`
			`test_lazy_prereq UNICODE_NFD_PRESERVED '`
			`mkdir d_${utf8_nfd} &&`
			`ls \| test-tool hexdump >dump &&`
			`grep "64 5f 65 cc 81" dump`
			`'`

			`# The following _DOUBLE_ forms are more for my curiosity,`
			`# but there may be quirks lurking when there are multiple`
			`# combining characters in non-canonical order.`

			`# Unicode also allows multiple combining characters`
			`# that can be decomposed in pieces.`
			`#`
			`# NFC: U+1f67 GREEK SMALL LETTER OMEGA WITH DASIA AND PERISPOMENI`
			`# UTF8(NFC): \xe1 \xbd \xa7`
			`#`
			`# NFD1: U+1f61 GREEK SMALL LETTER OMEGA WITH DASIA`
			`# U+0342 COMBINING GREEK PERISPOMENI`
			`# UTF8(NFD1): \xe1 \xbd \xa1 + \xcd \x82`
			`#`
			`# But U+1f61 decomposes into`
			`# NFD2: U+03c9 GREEK SMALL LETTER OMEGA`
			`# U+0314 COMBINING REVERSED COMMA ABOVE`
			`# UTF8(NFD2): \xcf \x89 + \xcc \x94`
			`#`
			`# Yielding: \xcf \x89 + \xcc \x94 + \xcd \x82`
			`#`
			`# Note that I've used the canonical ordering of the`
			`# combinining characters. It is also possible to`
			`# swap them. My testing shows that that non-standard`
			`# ordering also causes a collision in mkdir. However,`
			`# the resulting names don't draw correctly on the`
			`# terminal (implying that the on-disk format also has`
			`# them out of order).`
			`#`
			`greek_nfc=$(printf "\xe1\xbd\xa7")`
			`greek_nfd1=$(printf "\xe1\xbd\xa1\xcd\x82")`
			`greek_nfd2=$(printf "\xcf\x89\xcc\x94\xcd\x82")`

			`# See if a double decomposition also collides.`
			`#`
			`test_lazy_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE '`
			`mkdir trial_${greek_nfc} &&`
			`mkdir trial_${greek_nfd2}`
			`'`

			`# See if the NFC spelling appears on the disk.`
			`#`
			`test_lazy_prereq UNICODE_DOUBLE_NFC_PRESERVED '`
			`mkdir c_${greek_nfc} &&`
			`ls \| test-tool hexdump >dump &&`
			`grep "63 5f e1 bd a7" dump`
			`'`

			`# See if the NFD spelling appears on the disk.`
			`#`
			`test_lazy_prereq UNICODE_DOUBLE_NFD_PRESERVED '`
			`mkdir d_${greek_nfd2} &&`
			`ls \| test-tool hexdump >dump &&`
			`grep "64 5f cf 89 cc 94 cd 82" dump`
			`'`

			`# The following is for debugging. I found it useful when`
			`# trying to understand the various (OS, FS) quirks WRT`
			`# Unicode and how composition/decomposition is handled.`
			`# For example, when trying to understand how (macOS, APFS)`
			`# and (macOS, HFS) and (macOS, FAT32) compare.`
			`#`
			`# It is rather noisy, so it is disabled by default.`
			`#`
			`if test "$unicode_debug" = "true"`
			`then`
			`if test_have_prereq UNICODE_COMPOSITION_SENSITIVE`
			`then`
			`echo NFC and NFD are distinct on this OS/filesystem.`
			`else`
			`echo NFC and NFD are aliases on this OS/filesystem.`
			`fi`

			`if test_have_prereq UNICODE_NFC_PRESERVED`
			`then`
			`echo NFC maintains original spelling.`
			`else`
			`echo NFC is modified.`
			`fi`

			`if test_have_prereq UNICODE_NFD_PRESERVED`
			`then`
			`echo NFD maintains original spelling.`
			`else`
			`echo NFD is modified.`
			`fi`

			`if test_have_prereq UNICODE_DOUBLE_COMPOSITION_SENSITIVE`
			`then`
			`echo DOUBLE NFC and NFD are distinct on this OS/filesystem.`
			`else`
			`echo DOUBLE NFC and NFD are aliases on this OS/filesystem.`
			`fi`

			`if test_have_prereq UNICODE_DOUBLE_NFC_PRESERVED`
			`then`
			`echo Double NFC maintains original spelling.`
			`else`
			`echo Double NFC is modified.`
			`fi`

			`if test_have_prereq UNICODE_DOUBLE_NFD_PRESERVED`
			`then`
			`echo Double NFD maintains original spelling.`
			`else`
			`echo Double NFD is modified.`
			`fi`
			`fi`