Browse Source

diff: cache textconv output

Running a textconv filter can take a long time. It's
particularly bad for a large file which needs to be spooled
to disk, but even for small files, the fork+exec overhead
can add up for something like "git log -p".

This patch uses the notes-cache mechanism to keep a fast
cache of textconv output. Caches are stored in
refs/notes/textconv/$x, where $x is the userdiff driver
defined in gitattributes.

Caching is enabled only if diff.$x.cachetextconv is true.

In my test repo, on a commit with 45 jpg and avi files
changed and a textconv to show their exif tags:

  [before]
  $ time git show >/dev/null
  real    0m13.724s
  user    0m12.057s
  sys     0m1.624s

  [after, first run]
  $ git config diff.mfo.cachetextconv true
  $ time git show >/dev/null
  real    0m14.252s
  user    0m12.197s
  sys     0m1.800s

  [after, subsequent runs]
  $ time git show >/dev/null
  real    0m0.352s
  user    0m0.148s
  sys     0m0.200s

So for a slight (3.8%) cost on the first run, we achieve an
almost 40x speed up on subsequent runs.

Signed-off-by: Jeff King <peff@peff.net>
Signed-off-by: Junio C Hamano <gitster@pobox.com>
maint
Jeff King 15 years ago committed by Junio C Hamano
parent
commit
d9bae1a178
  1. 20
      Documentation/gitattributes.txt
  2. 52
      diff.c
  3. 109
      t/t4042-diff-textconv-caching.sh
  4. 9
      userdiff.c
  5. 4
      userdiff.h

20
Documentation/gitattributes.txt

@ -414,6 +414,26 @@ because it quickly conveys the changes you have made), you @@ -414,6 +414,26 @@ because it quickly conveys the changes you have made), you
should generate it separately and send it as a comment _in
addition to_ the usual binary diff that you might send.

Because text conversion can be slow, especially when doing a
large number of them with `git log -p`, git provides a mechanism
to cache the output and use it in future diffs. To enable
caching, set the "cachetextconv" variable in your diff driver's
config. For example:

------------------------
[diff "jpg"]
textconv = exif
cachetextconv = true
------------------------

This will cache the result of running "exif" on each blob
indefinitely. If you change the textconv config variable for a
diff driver, git will automatically invalidate the cache entries
and re-run the textconv filter. If you want to invalidate the
cache manually (e.g., because your version of "exif" was updated
and now produces better output), you can remove the cache
manually with `git update-ref -d refs/notes/textconv/jpg` (where
"jpg" is the name of the diff driver, as in the example above).

Performing a three-way merge
~~~~~~~~~~~~~~~~~~~~~~~~~~~~

52
diff.c

@ -43,7 +43,7 @@ static char diff_colors[][COLOR_MAXLEN] = { @@ -43,7 +43,7 @@ static char diff_colors[][COLOR_MAXLEN] = {
};

static void diff_filespec_load_driver(struct diff_filespec *one);
static size_t fill_textconv(const char *cmd,
static size_t fill_textconv(struct userdiff_driver *driver,
struct diff_filespec *df, char **outbuf);

static int parse_diff_color_slot(const char *var, int ofs)
@ -466,8 +466,8 @@ static void emit_rewrite_diff(const char *name_a, @@ -466,8 +466,8 @@ static void emit_rewrite_diff(const char *name_a,
const char *name_b,
struct diff_filespec *one,
struct diff_filespec *two,
const char *textconv_one,
const char *textconv_two,
struct userdiff_driver *textconv_one,
struct userdiff_driver *textconv_two,
struct diff_options *o)
{
int lc_a, lc_b;
@ -1569,14 +1569,26 @@ void diff_set_mnemonic_prefix(struct diff_options *options, const char *a, const @@ -1569,14 +1569,26 @@ void diff_set_mnemonic_prefix(struct diff_options *options, const char *a, const
options->b_prefix = b;
}

static const char *get_textconv(struct diff_filespec *one)
static struct userdiff_driver *get_textconv(struct diff_filespec *one)
{
if (!DIFF_FILE_VALID(one))
return NULL;
if (!S_ISREG(one->mode))
return NULL;
diff_filespec_load_driver(one);
return one->driver->textconv;
if (!one->driver->textconv)
return NULL;

if (one->driver->textconv_want_cache && !one->driver->textconv_cache) {
struct notes_cache *c = xmalloc(sizeof(*c));
struct strbuf name = STRBUF_INIT;

strbuf_addf(&name, "textconv/%s", one->driver->name);
notes_cache_init(c, name.buf, one->driver->textconv);
one->driver->textconv_cache = c;
}

return one->driver;
}

static void builtin_diff(const char *name_a,
@ -1593,7 +1605,8 @@ static void builtin_diff(const char *name_a, @@ -1593,7 +1605,8 @@ static void builtin_diff(const char *name_a,
const char *set = diff_get_color_opt(o, DIFF_METAINFO);
const char *reset = diff_get_color_opt(o, DIFF_RESET);
const char *a_prefix, *b_prefix;
const char *textconv_one = NULL, *textconv_two = NULL;
struct userdiff_driver *textconv_one = NULL;
struct userdiff_driver *textconv_two = NULL;
struct strbuf header = STRBUF_INIT;

if (DIFF_OPT_TST(o, SUBMODULE_LOG) &&
@ -3888,13 +3901,13 @@ static char *run_textconv(const char *pgm, struct diff_filespec *spec, @@ -3888,13 +3901,13 @@ static char *run_textconv(const char *pgm, struct diff_filespec *spec,
return strbuf_detach(&buf, outsize);
}

static size_t fill_textconv(const char *cmd,
static size_t fill_textconv(struct userdiff_driver *driver,
struct diff_filespec *df,
char **outbuf)
{
size_t size;

if (!cmd) {
if (!driver || !driver->textconv) {
if (!DIFF_FILE_VALID(df)) {
*outbuf = "";
return 0;
@ -3905,8 +3918,29 @@ static size_t fill_textconv(const char *cmd, @@ -3905,8 +3918,29 @@ static size_t fill_textconv(const char *cmd,
return df->size;
}

*outbuf = run_textconv(cmd, df, &size);
if (driver->textconv_cache) {
*outbuf = notes_cache_get(driver->textconv_cache, df->sha1,
&size);
if (*outbuf)
return size;
}

*outbuf = run_textconv(driver->textconv, df, &size);
if (!*outbuf)
die("unable to read files to diff");

if (driver->textconv_cache) {
/* ignore errors, as we might be in a readonly repository */
notes_cache_put(driver->textconv_cache, df->sha1, *outbuf,
size);
/*
* we could save up changes and flush them all at the end,
* but we would need an extra call after all diffing is done.
* Since generating a cache entry is the slow path anyway,
* this extra overhead probably isn't a big deal.
*/
notes_cache_write(driver->textconv_cache);
}

return size;
}

109
t/t4042-diff-textconv-caching.sh

@ -0,0 +1,109 @@ @@ -0,0 +1,109 @@
#!/bin/sh

test_description='test textconv caching'
. ./test-lib.sh

cat >helper <<'EOF'
#!/bin/sh
sed 's/^/converted: /' "$@" >helper.out
cat helper.out
EOF
chmod +x helper

test_expect_success 'setup' '
echo foo content 1 >foo.bin &&
echo bar content 1 >bar.bin &&
git add . &&
git commit -m one &&
echo foo content 2 >foo.bin &&
echo bar content 2 >bar.bin &&
git commit -a -m two &&
echo "*.bin diff=magic" >.gitattributes &&
git config diff.magic.textconv ./helper &&
git config diff.magic.cachetextconv true
'

cat >expect <<EOF
diff --git a/bar.bin b/bar.bin
index fcf9166..28283d5 100644
--- a/bar.bin
+++ b/bar.bin
@@ -1 +1 @@
-converted: bar content 1
+converted: bar content 2
diff --git a/foo.bin b/foo.bin
index d5b9fe3..1345db2 100644
--- a/foo.bin
+++ b/foo.bin
@@ -1 +1 @@
-converted: foo content 1
+converted: foo content 2
EOF

test_expect_success 'first textconv works' '
git diff HEAD^ HEAD >actual &&
test_cmp expect actual
'

test_expect_success 'cached textconv produces same output' '
git diff HEAD^ HEAD >actual &&
test_cmp expect actual
'

test_expect_success 'cached textconv does not run helper' '
rm -f helper.out &&
git diff HEAD^ HEAD >actual &&
test_cmp expect actual &&
! test -r helper.out
'

cat >expect <<EOF
diff --git a/bar.bin b/bar.bin
index fcf9166..28283d5 100644
--- a/bar.bin
+++ b/bar.bin
@@ -1,2 +1,2 @@
converted: other
-converted: bar content 1
+converted: bar content 2
diff --git a/foo.bin b/foo.bin
index d5b9fe3..1345db2 100644
--- a/foo.bin
+++ b/foo.bin
@@ -1,2 +1,2 @@
converted: other
-converted: foo content 1
+converted: foo content 2
EOF
test_expect_success 'changing textconv invalidates cache' '
echo other >other &&
git config diff.magic.textconv "./helper other" &&
git diff HEAD^ HEAD >actual &&
test_cmp expect actual
'

cat >expect <<EOF
diff --git a/bar.bin b/bar.bin
index fcf9166..28283d5 100644
--- a/bar.bin
+++ b/bar.bin
@@ -1,2 +1,2 @@
converted: other
-converted: bar content 1
+converted: bar content 2
diff --git a/foo.bin b/foo.bin
index d5b9fe3..1345db2 100644
--- a/foo.bin
+++ b/foo.bin
@@ -1 +1 @@
-converted: foo content 1
+converted: foo content 2
EOF
test_expect_success 'switching diff driver produces correct results' '
git config diff.moremagic.textconv ./helper &&
echo foo.bin diff=moremagic >>.gitattributes &&
git diff HEAD^ HEAD >actual &&
test_cmp expect actual
'

test_done

9
userdiff.c

@ -1,3 +1,4 @@ @@ -1,3 +1,4 @@
#include "cache.h"
#include "userdiff.h"
#include "cache.h"
#include "attr.h"
@ -167,6 +168,12 @@ static int parse_tristate(int *b, const char *k, const char *v) @@ -167,6 +168,12 @@ static int parse_tristate(int *b, const char *k, const char *v)
return 1;
}

static int parse_bool(int *b, const char *k, const char *v)
{
*b = git_config_bool(k, v);
return 1;
}

int userdiff_config(const char *k, const char *v)
{
struct userdiff_driver *drv;
@ -181,6 +188,8 @@ int userdiff_config(const char *k, const char *v) @@ -181,6 +188,8 @@ int userdiff_config(const char *k, const char *v)
return parse_string(&drv->external, k, v);
if ((drv = parse_driver(k, v, "textconv")))
return parse_string(&drv->textconv, k, v);
if ((drv = parse_driver(k, v, "cachetextconv")))
return parse_bool(&drv->textconv_want_cache, k, v);
if ((drv = parse_driver(k, v, "wordregex")))
return parse_string(&drv->word_regex, k, v);


4
userdiff.h

@ -1,6 +1,8 @@ @@ -1,6 +1,8 @@
#ifndef USERDIFF_H
#define USERDIFF_H

#include "notes-cache.h"

struct userdiff_funcname {
const char *pattern;
int cflags;
@ -13,6 +15,8 @@ struct userdiff_driver { @@ -13,6 +15,8 @@ struct userdiff_driver {
struct userdiff_funcname funcname;
const char *word_regex;
const char *textconv;
struct notes_cache *textconv_cache;
int textconv_want_cache;
};

int userdiff_config(const char *k, const char *v);

Loading…
Cancel
Save