diff --git a/utf8.c b/utf8.c index 83af3a9f6b..25d366d6b3 100644 --- a/utf8.c +++ b/utf8.c @@ -586,6 +586,19 @@ int has_prohibited_utf_bom(const char *enc, const char *data, size_t len) ); } +int is_missing_required_utf_bom(const char *enc, const char *data, size_t len) +{ + return ( + (same_utf_encoding(enc, "UTF-16")) && + !(has_bom_prefix(data, len, utf16_be_bom, sizeof(utf16_be_bom)) || + has_bom_prefix(data, len, utf16_le_bom, sizeof(utf16_le_bom))) + ) || ( + (same_utf_encoding(enc, "UTF-32")) && + !(has_bom_prefix(data, len, utf32_be_bom, sizeof(utf32_be_bom)) || + has_bom_prefix(data, len, utf32_le_bom, sizeof(utf32_le_bom))) + ); +} + /* * Returns first character length in bytes for multi-byte `text` according to * `encoding`. diff --git a/utf8.h b/utf8.h index 0db1db4519..cce654a64a 100644 --- a/utf8.h +++ b/utf8.h @@ -79,4 +79,23 @@ void strbuf_utf8_align(struct strbuf *buf, align_type position, unsigned int wid */ int has_prohibited_utf_bom(const char *enc, const char *data, size_t len); +/* + * If the endianness is not defined in the encoding name, then we + * require a BOM. The function returns true if a required BOM is missing. + * + * The Unicode standard instructs to assume big-endian if there in no + * BOM for UTF-16/32 [1][2]. However, the W3C/WHATWG encoding standard + * used in HTML5 recommends to assume little-endian to "deal with + * deployed content" [3]. + * + * Therefore, strictly requiring a BOM seems to be the safest option for + * content in Git. + * + * [1] http://unicode.org/faq/utf_bom.html#gen6 + * [2] http://www.unicode.org/versions/Unicode10.0.0/ch03.pdf + * Section 3.10, D98, page 132 + * [3] https://encoding.spec.whatwg.org/#utf-16le + */ +int is_missing_required_utf_bom(const char *enc, const char *data, size_t len); + #endif