git/convert.c

#include "cache.h"
/*
 * convert.c - convert a file when checking it out and checking it in.
 *
 * This should use the pathname to decide on whether it wants to do some
 * more interesting conversions (automatic gzip/unzip, general format
 * conversions etc etc), but by default it just does automatic CRLF<->LF
 * translation when the "auto_crlf" option is set.
 */

/*
 * Byte-class tallies for a single buffer, as filled in by gather_stats().
 */
struct text_stat {
	/*
	 * Line-ending tallies: every CR byte, every LF byte, and every CR
	 * that is immediately followed by an LF.
	 */
	unsigned cr;
	unsigned lf;
	unsigned crlf;

	/* Rough text-vs-binary tallies; these are only approximations. */
	unsigned printable;
	unsigned nonprintable;
};

/*
 * Classify the first `size` bytes of `buf` and record the tallies in
 * `*stats`, which is zeroed before counting starts.
 *
 *  - '\r' bumps `cr`, and also `crlf` when the next byte is '\n'.
 *  - '\n' bumps `lf` (including the LF half of a CRLF pair).
 *  - BS, HT, FF, ESC and every byte >= 32 other than DEL bump `printable`.
 *  - DEL and the remaining control bytes bump `nonprintable`.
 *
 * CR and LF themselves are counted in neither `printable` nor
 * `nonprintable`.
 */
static void gather_stats(const char *buf, unsigned long size, struct text_stat *stats)
{
	unsigned long pos = 0;

	memset(stats, 0, sizeof(*stats));

	while (pos < size) {
		unsigned char ch = buf[pos++];

		switch (ch) {
		case '\r':
			stats->cr++;
			/* `pos` already points at the byte after the CR */
			if (pos < size && buf[pos] == '\n')
				stats->crlf++;
			break;
		case '\n':
			stats->lf++;
			break;
		/* BS, HT, FF and ESC are control bytes we still treat as text */
		case '\b':
		case '\t':
		case '\014':
		case '\033':
			stats->printable++;
			break;
		case 127:
			/* DEL */
			stats->nonprintable++;
			break;
		default:
			if (ch < 32)
				stats->nonprintable++;
			else
				stats->printable++;
			break;
		}
	}
}

/*
 * Guess whether the buffer summarised by `stats` is binary.
 * (Original note: the same heuristics as diff.c::mmfile_is_binary().)
 *
 * Returns 1 when the non-printable count exceeds 1/128th (rounded down)
 * of the printable count, 0 otherwise. `size` is accepted but not
 * consulted by the current heuristic.
 */
static int is_binary(unsigned long size, struct text_stat *stats)
{
	/* Non-printable bytes tolerated before we call the content binary */
	unsigned tolerated = stats->printable >> 7;

	/*
	 * Other heuristics? Average line length might be relevant,
	 * as might LF vs CR vs CRLF counts..
	 *
	 * NOTE! It might be normal to have a low ratio of CRLF to LF
	 * (somebody starts with a LF-only file and edits it with an editor
	 * that adds CRLF only to lines that are added..). But do we
	 * want to support CR-only? Probably not.
	 */
	return stats->nonprintable > tolerated;
}

/*
 * Check-in side conversion: strip the CR from every CRLF pair.
 *
 * `*bufp` / `*sizep` describe the input buffer. `path` is currently
 * unused. When a conversion is performed, a new buffer obtained from
 * xmalloc() is stored in `*bufp`, its length in `*sizep`, and 1 is
 * returned so the caller knows the buffer was switched; the input
 * buffer is neither modified nor freed here. Otherwise 0 is returned
 * and both out-parameters are left untouched.
 */
int convert_to_git(const char *path, char **bufp, unsigned long *sizep)
{
	struct text_stat stats;
	unsigned long in_len, out_len, i;
	const char *src;
	char *dst;

	/*
	 * FIXME! Other pluggable conversions should go here,
	 * based on filename patterns. Right now we just do the
	 * stupid auto-CRLF one.
	 */
	if (!auto_crlf)
		return 0;

	in_len = *sizep;
	if (!in_len)
		return 0;
	src = *bufp;

	gather_stats(src, in_len, &stats);

	/*
	 * Leave the content alone when:
	 *  - it has no CR at all (nothing to convert),
	 *  - it has a bare CR, i.e. one not followed by LF (we do not
	 *    even try to convert that), or
	 *  - the heuristics say it is binary.
	 */
	if (!stats.cr || stats.cr != stats.crlf || is_binary(in_len, &stats))
		return 0;

	/*
	 * Every CR is half of a CRLF at this point, so the output is
	 * exactly `crlf` bytes shorter than the input.
	 */
	out_len = in_len - stats.crlf;
	dst = xmalloc(out_len);
	*bufp = dst;
	*sizep = out_len;

	for (i = 0; i < in_len; i++) {
		unsigned char ch = src[i];

		if (ch != '\r')
			*dst++ = ch;
	}

	return 1;
}

/*
 * Check-out side conversion: put a CR in front of every LF that does
 * not already have one.
 *
 * Does nothing unless `auto_crlf` is positive. `path` is currently
 * unused. When a conversion is performed, a new buffer obtained from
 * xmalloc() is stored in `*bufp`, its length in `*sizep`, and 1 is
 * returned so the caller knows the buffer was switched; the input
 * buffer is neither modified nor freed here. Otherwise 0 is returned
 * and both out-parameters are left untouched.
 */
int convert_to_working_tree(const char *path, char **bufp, unsigned long *sizep)
{
	struct text_stat stats;
	unsigned long in_len, out_len, i;
	const char *src;
	char *dst;
	unsigned char prev = 0;

	/*
	 * FIXME! Other pluggable conversions should go here,
	 * based on filename patterns. Right now we just do the
	 * stupid auto-CRLF one.
	 */
	if (auto_crlf <= 0)
		return 0;

	in_len = *sizep;
	if (!in_len)
		return 0;
	src = *bufp;

	gather_stats(src, in_len, &stats);

	/*
	 * Leave the content alone when:
	 *  - it has no LF at all (nothing to convert),
	 *  - every LF is already part of a CRLF,
	 *  - it has a bare CR, i.e. one not followed by LF, or
	 *  - the heuristics say it is binary.
	 */
	if (!stats.lf || stats.lf == stats.crlf ||
	    stats.cr != stats.crlf || is_binary(in_len, &stats))
		return 0;

	/* One extra byte for each LF that is not already preceded by CR */
	out_len = in_len + stats.lf - stats.crlf;
	dst = xmalloc(out_len);
	*bufp = dst;
	*sizep = out_len;

	for (i = 0; i < in_len; i++) {
		unsigned char ch = src[i];

		if (ch == '\n' && prev != '\r')
			*dst++ = '\r';
		*dst++ = ch;
		prev = ch;
	}

	return 1;
}
Lazy man's auto-CRLF. It currently does NOT know about file attributes, so it does its conversion purely based on content. Maybe that is more in the "git philosophy" anyway, since content is king, but I think we should try to do the file attributes to turn it off on demand. Anyway, BY DEFAULT it is off regardless, because it requires a "[core] AutoCRLF = true" in your config file to be enabled. We could make that the default for Windows, of course, the same way we do some other things (filemode etc). But you can actually enable it on UNIX, and it will cause: - "git update-index" will write blobs without CRLF; - "git diff" will diff working tree files without CRLF; - "git checkout" will write files to the working tree _with_ CRLF; and things work fine. Funnily, it actually shows an odd file in git itself: running "git clone -n git test-crlf; cd test-crlf; git config core.autocrlf true; git checkout; git diff" shows a diff for "Documentation/docbook-xsl.css". Why? Because we have actually checked in that file with CRLF! So when "core.autocrlf" is true, we'll always generate a different hash for it in the index, because the index hash will be for the content _without_ CRLF. Is this complete? I dunno. It seems to work for me. It doesn't use the filename at all right now, and that's probably a deficiency (we could certainly make the "is_binary()" heuristics also take standard filename heuristics into account). I don't pass in the filename at all for the "index_fd()" case (git-update-index), so that would need to be passed around, but this actually works fine. NOTE NOTE NOTE! The "is_binary()" heuristics are totally made-up by yours truly. I will not guarantee that they work at all reasonably. Caveat emptor. But it _is_ simple, and it _is_ safe, since it's all off by default. The patch is pretty simple - the biggest part is the new "convert.c" file, but even that is really just basic stuff that anybody can write in "Teaching C 101" as a final project for their first class in programming. 
Not to say that it's bug-free, of course - but at least we're not talking about rocket surgery here. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Junio C Hamano <junkio@cox.net> 18 years ago			`#include "cache.h"`
			`/*`
			`* convert.c - convert a file when checking it out and checking it in.`
			`*`
			`* This should use the pathname to decide on whether it wants to do some`
			`* more interesting conversions (automatic gzip/unzip, general format`
			`* conversions etc etc), but by default it just does automatic CRLF<->LF`
			`* translation when the "auto_crlf" option is set.`
			`*/`

			`struct text_stat {`
			`/* CR, LF and CRLF counts */`
			`unsigned cr, lf, crlf;`

			`/* These are just approximations! */`
			`unsigned printable, nonprintable;`
			`};`

			`static void gather_stats(const char buf, unsigned long size, struct text_stat stats)`
			`{`
			`unsigned long i;`

			`memset(stats, 0, sizeof(*stats));`

			`for (i = 0; i < size; i++) {`
			`unsigned char c = buf[i];`
			`if (c == '\r') {`
			`stats->cr++;`
			`if (i+1 < size && buf[i+1] == '\n')`
			`stats->crlf++;`
			`continue;`
			`}`
			`if (c == '\n') {`
			`stats->lf++;`
			`continue;`
			`}`
			`if (c == 127)`
			`/* DEL */`
			`stats->nonprintable++;`
			`else if (c < 32) {`
			`switch (c) {`
			`/* BS, HT, ESC and FF */`
			`case '\b': case '\t': case '\033': case '\014':`
			`stats->printable++;`
			`break;`
			`default:`
			`stats->nonprintable++;`
			`}`
			`}`
			`else`
			`stats->printable++;`
			`}`
			`}`

			`/*`
			`* The same heuristics as diff.c::mmfile_is_binary()`
			`*/`
			`static int is_binary(unsigned long size, struct text_stat *stats)`
			`{`

			`if ((stats->printable >> 7) < stats->nonprintable)`
			`return 1;`
			`/*`
			`* Other heuristics? Average line length might be relevant,`
			`* as might LF vs CR vs CRLF counts..`
			`*`
			`* NOTE! It might be normal to have a low ratio of CRLF to LF`
			`* (somebody starts with a LF-only file and edits it with an editor`
			`* that adds CRLF only to lines that are added..). But do we`
			`* want to support CR-only? Probably not.`
			`*/`
			`return 0;`
			`}`

			`int convert_to_git(const char path, char bufp, unsigned long sizep)`
			`{`
			`char buffer, nbuf;`
			`unsigned long size, nsize;`
			`struct text_stat stats;`

			`/*`
			`* FIXME! Other pluggable conversions should go here,`
			`* based on filename patterns. Right now we just do the`
			`* stupid auto-CRLF one.`
			`*/`
			`if (!auto_crlf)`
			`return 0;`

			`size = *sizep;`
			`if (!size)`
			`return 0;`
			`buffer = *bufp;`

			`gather_stats(buffer, size, &stats);`

			`/* No CR? Nothing to convert, regardless. */`
			`if (!stats.cr)`
			`return 0;`

			`/*`
			`* We're currently not going to even try to convert stuff`
			`* that has bare CR characters. Does anybody do that crazy`
			`* stuff?`
			`*/`
			`if (stats.cr != stats.crlf)`
			`return 0;`

			`/*`
			`* And add some heuristics for binary vs text, of course...`
			`*/`
			`if (is_binary(size, &stats))`
			`return 0;`

			`/*`
			`* Ok, allocate a new buffer, fill it in, and return true`
			`* to let the caller know that we switched buffers on it.`
			`*/`
			`nsize = size - stats.crlf;`
			`nbuf = xmalloc(nsize);`
			`*bufp = nbuf;`
			`*sizep = nsize;`
			`do {`
			`unsigned char c = *buffer++;`
			`if (c != '\r')`
			`*nbuf++ = c;`
			`} while (--size);`

			`return 1;`
			`}`

			`int convert_to_working_tree(const char path, char bufp, unsigned long sizep)`
			`{`
			`char buffer, nbuf;`
			`unsigned long size, nsize;`
			`struct text_stat stats;`
			`unsigned char last;`

			`/*`
			`* FIXME! Other pluggable conversions should go here,`
			`* based on filename patterns. Right now we just do the`
			`* stupid auto-CRLF one.`
			`*/`
Make AutoCRLF ternary variable. This allows you to do: [core] AutoCRLF = input and it should do only the CRLF->LF translation (ie it simplifies CRLF only when reading working tree files, but when checking out files, it leaves the LF alone, and doesn't turn it into a CRLF). Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Junio C Hamano <junkio@cox.net> 18 years ago			`if (auto_crlf <= 0)`
Lazy man's auto-CRLF. It currently does NOT know about file attributes, so it does its conversion purely based on content. Maybe that is more in the "git philosophy" anyway, since content is king, but I think we should try to do the file attributes to turn it off on demand. Anyway, BY DEFAULT it is off regardless, because it requires a "[core] AutoCRLF = true" in your config file to be enabled. We could make that the default for Windows, of course, the same way we do some other things (filemode etc). But you can actually enable it on UNIX, and it will cause: - "git update-index" will write blobs without CRLF; - "git diff" will diff working tree files without CRLF; - "git checkout" will write files to the working tree _with_ CRLF; and things work fine. Funnily, it actually shows an odd file in git itself: running "git clone -n git test-crlf; cd test-crlf; git config core.autocrlf true; git checkout; git diff" shows a diff for "Documentation/docbook-xsl.css". Why? Because we have actually checked in that file with CRLF! So when "core.autocrlf" is true, we'll always generate a different hash for it in the index, because the index hash will be for the content _without_ CRLF. Is this complete? I dunno. It seems to work for me. It doesn't use the filename at all right now, and that's probably a deficiency (we could certainly make the "is_binary()" heuristics also take standard filename heuristics into account). I don't pass in the filename at all for the "index_fd()" case (git-update-index), so that would need to be passed around, but this actually works fine. NOTE NOTE NOTE! The "is_binary()" heuristics are totally made-up by yours truly. I will not guarantee that they work at all reasonably. Caveat emptor. But it _is_ simple, and it _is_ safe, since it's all off by default. The patch is pretty simple - the biggest part is the new "convert.c" file, but even that is really just basic stuff that anybody can write in "Teaching C 101" as a final project for their first class in programming. 
Not to say that it's bug-free, of course - but at least we're not talking about rocket surgery here. Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> Signed-off-by: Junio C Hamano <junkio@cox.net> 18 years ago			`return 0;`

			`size = *sizep;`
			`if (!size)`
			`return 0;`
			`buffer = *bufp;`

			`gather_stats(buffer, size, &stats);`

			`/* No LF? Nothing to convert, regardless. */`
			`if (!stats.lf)`
			`return 0;`

			`/* Was it already in CRLF format? */`
			`if (stats.lf == stats.crlf)`
			`return 0;`

			`/* If we have any bare CR characters, we're not going to touch it */`
			`if (stats.cr != stats.crlf)`
			`return 0;`

			`if (is_binary(size, &stats))`
			`return 0;`

			`/*`
			`* Ok, allocate a new buffer, fill it in, and return true`
			`* to let the caller know that we switched buffers on it.`
			`*/`
			`nsize = size + stats.lf - stats.crlf;`
			`nbuf = xmalloc(nsize);`
			`*bufp = nbuf;`
			`*sizep = nsize;`
			`last = 0;`
			`do {`
			`unsigned char c = *buffer++;`
			`if (c == '\n' && last != '\r')`
			`*nbuf++ = '\r';`
			`*nbuf++ = c;`
			`last = c;`
			`} while (--size);`

			`return 1;`
			`}`