diff --git a/Documentation/git-update-index.txt b/Documentation/git-update-index.txt index a3081f4e23..9d0b1515c5 100644 --- a/Documentation/git-update-index.txt +++ b/Documentation/git-update-index.txt @@ -19,7 +19,7 @@ SYNOPSIS [--ignore-submodules] [--really-refresh] [--unresolve] [--again | -g] [--info-only] [--index-info] - [-z] [--stdin] + [-z] [--stdin] [--index-version ] [--verbose] [--] [...] @@ -143,6 +143,10 @@ you will need to handle the situation manually. --verbose:: Report what is being added and removed from index. +--index-version :: + Write the resulting index out in the named on-disk format version. + The current default version is 2. + -z:: Only meaningful with `--stdin` or `--index-info`; paths are separated with NUL character instead of LF. diff --git a/Documentation/technical/index-format.txt b/Documentation/technical/index-format.txt index 8930b3fabc..9d25b30178 100644 --- a/Documentation/technical/index-format.txt +++ b/Documentation/technical/index-format.txt @@ -113,9 +113,22 @@ GIT index format are encoded in 7-bit ASCII and the encoding cannot contain a NUL byte (iow, this is a UNIX pathname). + (Version 4) In version 4, the entry path name is prefix-compressed + relative to the path name for the previous entry (the very first + entry is encoded as if the path name for the previous entry is an + empty string). At the beginning of an entry, an integer N in the + variable width encoding (the same encoding as the offset is encoded + for OFS_DELTA pack entries; see pack-format.txt) is stored, followed + by a NUL-terminated string S. Removing N bytes from the end of the + path name for the previous entry, and replacing it with the string S + yields the path name for this entry. + 1-8 nul bytes as necessary to pad the entry to a multiple of eight bytes while keeping the name NUL-terminated. + (Version 4) In version 4, the padding after the pathname does not + exist. + == Extensions === Cached tree diff --git a/Makefile b/Makefile index d6748e0754..a14732c4cd 100644 --- a/Makefile +++ b/Makefile @@ -631,6 +631,7 @@ LIB_H += tree-walk.h LIB_H += unpack-trees.h LIB_H += userdiff.h LIB_H += utf8.h +LIB_H += varint.h LIB_H += xdiff-interface.h LIB_H += xdiff/xdiff.h @@ -757,6 +758,7 @@ LIB_OBJS += url.o LIB_OBJS += usage.o LIB_OBJS += userdiff.o LIB_OBJS += utf8.o +LIB_OBJS += varint.o LIB_OBJS += walker.o LIB_OBJS += wrapper.o LIB_OBJS += write_or_die.o diff --git a/builtin/update-index.c b/builtin/update-index.c index a6a23fa1f3..5f038d64da 100644 --- a/builtin/update-index.c +++ b/builtin/update-index.c @@ -708,6 +708,7 @@ int cmd_update_index(int argc, const char **argv, const char *prefix) int newfd, entries, has_errors = 0, line_termination = '\n'; int read_from_stdin = 0; int prefix_length = prefix ? strlen(prefix) : 0; + int preferred_index_format = 0; char set_executable_bit = 0; struct refresh_params refresh_args = {0, &has_errors}; int lock_error = 0; @@ -791,6 +792,8 @@ int cmd_update_index(int argc, const char **argv, const char *prefix) "(for porcelains) forget saved unresolved conflicts", PARSE_OPT_NOARG | PARSE_OPT_NONEG, resolve_undo_clear_callback}, + OPT_INTEGER(0, "index-version", &preferred_index_format, + "write index in this format"), OPT_END() }; @@ -851,6 +854,17 @@ int cmd_update_index(int argc, const char **argv, const char *prefix) } } argc = parse_options_end(&ctx); + if (preferred_index_format) { + if (preferred_index_format < INDEX_FORMAT_LB || + INDEX_FORMAT_UB < preferred_index_format) + die("index-version %d not in range: %d..%d", + preferred_index_format, + INDEX_FORMAT_LB, INDEX_FORMAT_UB); + + if (the_index.version != preferred_index_format) + active_cache_changed = 1; + the_index.version = preferred_index_format; + } if (read_from_stdin) { struct strbuf buf = STRBUF_INIT, nbuf = STRBUF_INIT; diff --git a/cache.h b/cache.h index 5bf59ff5c3..d8f6f1e78e 100644 --- a/cache.h +++ b/cache.h @@ -105,6 +105,9 @@ struct cache_header { unsigned int hdr_entries; }; +#define INDEX_FORMAT_LB 2 +#define INDEX_FORMAT_UB 4 + /* * The "cache_time" is just the low 32 bits of the * time. It doesn't matter if it overflows - we only @@ -115,48 +118,6 @@ struct cache_time { unsigned int nsec; }; -/* - * dev/ino/uid/gid/size are also just tracked to the low 32 bits - * Again - this is just a (very strong in practice) heuristic that - * the inode hasn't changed. - * - * We save the fields in big-endian order to allow using the - * index file over NFS transparently. - */ -struct ondisk_cache_entry { - struct cache_time ctime; - struct cache_time mtime; - unsigned int dev; - unsigned int ino; - unsigned int mode; - unsigned int uid; - unsigned int gid; - unsigned int size; - unsigned char sha1[20]; - unsigned short flags; - char name[FLEX_ARRAY]; /* more */ -}; - -/* - * This struct is used when CE_EXTENDED bit is 1 - * The struct must match ondisk_cache_entry exactly from - * ctime till flags - */ -struct ondisk_cache_entry_extended { - struct cache_time ctime; - struct cache_time mtime; - unsigned int dev; - unsigned int ino; - unsigned int mode; - unsigned int uid; - unsigned int gid; - unsigned int size; - unsigned char sha1[20]; - unsigned short flags; - unsigned short flags2; - char name[FLEX_ARRAY]; /* more */ -}; - struct cache_entry { struct cache_time ce_ctime; struct cache_time ce_mtime; @@ -253,9 +214,6 @@ static inline size_t ce_namelen(const struct cache_entry *ce) } #define ce_size(ce) cache_entry_size(ce_namelen(ce)) -#define ondisk_ce_size(ce) (((ce)->ce_flags & CE_EXTENDED) ? \ - ondisk_cache_entry_extended_size(ce_namelen(ce)) : \ - ondisk_cache_entry_size(ce_namelen(ce))) #define ce_stage(ce) ((CE_STAGEMASK & (ce)->ce_flags) >> CE_STAGESHIFT) #define ce_uptodate(ce) ((ce)->ce_flags & CE_UPTODATE) #define ce_skip_worktree(ce) ((ce)->ce_flags & CE_SKIP_WORKTREE) @@ -306,13 +264,11 @@ static inline unsigned int canon_mode(unsigned int mode) return S_IFGITLINK; } -#define flexible_size(STRUCT,len) ((offsetof(struct STRUCT,name) + (len) + 8) & ~7) #define cache_entry_size(len) (offsetof(struct cache_entry,name) + (len) + 1) -#define ondisk_cache_entry_size(len) flexible_size(ondisk_cache_entry,len) -#define ondisk_cache_entry_extended_size(len) flexible_size(ondisk_cache_entry_extended,len) struct index_state { struct cache_entry **cache; + unsigned int version; unsigned int cache_nr, cache_alloc, cache_changed; struct string_list *resolve_undo; struct cache_tree *cache_tree; diff --git a/read-cache.c b/read-cache.c index 6c8f395836..ef355cc9a8 100644 --- a/read-cache.c +++ b/read-cache.c @@ -12,6 +12,8 @@ #include "commit.h" #include "blob.h" #include "resolve-undo.h" +#include "strbuf.h" +#include "varint.h" static struct cache_entry *refresh_cache_entry(struct cache_entry *ce, int really); @@ -1179,15 +1181,74 @@ static struct cache_entry *refresh_cache_entry(struct cache_entry *ce, int reall return refresh_cache_ent(&the_index, ce, really, NULL, NULL); } + +/***************************************************************** + * Index File I/O + *****************************************************************/ + +#define INDEX_FORMAT_DEFAULT 3 + +/* + * dev/ino/uid/gid/size are also just tracked to the low 32 bits + * Again - this is just a (very strong in practice) heuristic that + * the inode hasn't changed. + * + * We save the fields in big-endian order to allow using the + * index file over NFS transparently. + */ +struct ondisk_cache_entry { + struct cache_time ctime; + struct cache_time mtime; + unsigned int dev; + unsigned int ino; + unsigned int mode; + unsigned int uid; + unsigned int gid; + unsigned int size; + unsigned char sha1[20]; + unsigned short flags; + char name[FLEX_ARRAY]; /* more */ +}; + +/* + * This struct is used when CE_EXTENDED bit is 1 + * The struct must match ondisk_cache_entry exactly from + * ctime till flags + */ +struct ondisk_cache_entry_extended { + struct cache_time ctime; + struct cache_time mtime; + unsigned int dev; + unsigned int ino; + unsigned int mode; + unsigned int uid; + unsigned int gid; + unsigned int size; + unsigned char sha1[20]; + unsigned short flags; + unsigned short flags2; + char name[FLEX_ARRAY]; /* more */ +}; + +/* These are only used for v3 or lower */ +#define align_flex_name(STRUCT,len) ((offsetof(struct STRUCT,name) + (len) + 8) & ~7) +#define ondisk_cache_entry_size(len) align_flex_name(ondisk_cache_entry,len) +#define ondisk_cache_entry_extended_size(len) align_flex_name(ondisk_cache_entry_extended,len) +#define ondisk_ce_size(ce) (((ce)->ce_flags & CE_EXTENDED) ? \ + ondisk_cache_entry_extended_size(ce_namelen(ce)) : \ + ondisk_cache_entry_size(ce_namelen(ce))) + static int verify_hdr(struct cache_header *hdr, unsigned long size) { git_SHA_CTX c; unsigned char sha1[20]; + int hdr_version; if (hdr->hdr_signature != htonl(CACHE_SIGNATURE)) return error("bad signature"); - if (hdr->hdr_version != htonl(2) && hdr->hdr_version != htonl(3)) - return error("bad index version"); + hdr_version = ntohl(hdr->hdr_version); + if (hdr_version < 2 || 4 < hdr_version) + return error("bad index version %d", hdr_version); git_SHA1_Init(&c); git_SHA1_Update(&c, hdr, size - 20); git_SHA1_Final(sha1, &c); @@ -1221,7 +1282,74 @@ int read_index(struct index_state *istate) return read_index_from(istate, get_index_file()); } -static struct cache_entry *create_from_disk(struct ondisk_cache_entry *ondisk) +#ifndef NEEDS_ALIGNED_ACCESS +#define ntoh_s(var) ntohs(var) +#define ntoh_l(var) ntohl(var) +#else +static inline uint16_t ntoh_s_force_align(void *p) +{ + uint16_t x; + memcpy(&x, p, sizeof(x)); + return ntohs(x); +} +static inline uint32_t ntoh_l_force_align(void *p) +{ + uint32_t x; + memcpy(&x, p, sizeof(x)); + return ntohl(x); +} +#define ntoh_s(var) ntoh_s_force_align(&(var)) +#define ntoh_l(var) ntoh_l_force_align(&(var)) +#endif + +static struct cache_entry *cache_entry_from_ondisk(struct ondisk_cache_entry *ondisk, + unsigned int flags, + const char *name, + size_t len) +{ + struct cache_entry *ce = xmalloc(cache_entry_size(len)); + + ce->ce_ctime.sec = ntoh_l(ondisk->ctime.sec); + ce->ce_mtime.sec = ntoh_l(ondisk->mtime.sec); + ce->ce_ctime.nsec = ntoh_l(ondisk->ctime.nsec); + ce->ce_mtime.nsec = ntoh_l(ondisk->mtime.nsec); + ce->ce_dev = ntoh_l(ondisk->dev); + ce->ce_ino = ntoh_l(ondisk->ino); + ce->ce_mode = ntoh_l(ondisk->mode); + ce->ce_uid = ntoh_l(ondisk->uid); + ce->ce_gid = ntoh_l(ondisk->gid); + ce->ce_size = ntoh_l(ondisk->size); + ce->ce_flags = flags; + hashcpy(ce->sha1, ondisk->sha1); + memcpy(ce->name, name, len); + ce->name[len] = '\0'; + return ce; +} + +/* + * Adjacent cache entries tend to share the leading paths, so it makes + * sense to only store the differences in later entries. In the v4 + * on-disk format of the index, each on-disk cache entry stores the + * number of bytes to be stripped from the end of the previous name, + * and the bytes to append to the result, to come up with its name. + */ +static unsigned long expand_name_field(struct strbuf *name, const char *cp_) +{ + const unsigned char *ep, *cp = (const unsigned char *)cp_; + size_t len = decode_varint(&cp); + + if (name->len < len) + die("malformed name field in the index"); + strbuf_remove(name, name->len - len, len); + for (ep = cp; *ep; ep++) + ; /* find the end */ + strbuf_add(name, cp, ep - cp); + return (const char *)ep + 1 - cp_; +} + +static struct cache_entry *create_from_disk(struct ondisk_cache_entry *ondisk, + unsigned long *ent_size, + struct strbuf *previous_name) { struct cache_entry *ce; size_t len; @@ -1229,14 +1357,14 @@ static struct cache_entry *create_from_disk(struct ondisk_cache_entry *ondisk) unsigned int flags; /* On-disk flags are just 16 bits */ - flags = ntohs(ondisk->flags); + flags = ntoh_s(ondisk->flags); len = flags & CE_NAMEMASK; if (flags & CE_EXTENDED) { struct ondisk_cache_entry_extended *ondisk2; int extended_flags; ondisk2 = (struct ondisk_cache_entry_extended *)ondisk; - extended_flags = ntohs(ondisk2->flags2) << 16; + extended_flags = ntoh_s(ondisk2->flags2) << 16; /* We do not yet understand any bit out of CE_EXTENDED_FLAGS */ if (extended_flags & ~CE_EXTENDED_FLAGS) die("Unknown index entry format %08x", extended_flags); @@ -1246,27 +1374,22 @@ static struct cache_entry *create_from_disk(struct ondisk_cache_entry *ondisk) else name = ondisk->name; - if (len == CE_NAMEMASK) - len = strlen(name); - - ce = xmalloc(cache_entry_size(len)); - - ce->ce_ctime.sec = ntohl(ondisk->ctime.sec); - ce->ce_mtime.sec = ntohl(ondisk->mtime.sec); - ce->ce_ctime.nsec = ntohl(ondisk->ctime.nsec); - ce->ce_mtime.nsec = ntohl(ondisk->mtime.nsec); - ce->ce_dev = ntohl(ondisk->dev); - ce->ce_ino = ntohl(ondisk->ino); - ce->ce_mode = ntohl(ondisk->mode); - ce->ce_uid = ntohl(ondisk->uid); - ce->ce_gid = ntohl(ondisk->gid); - ce->ce_size = ntohl(ondisk->size); - ce->ce_flags = flags; - - hashcpy(ce->sha1, ondisk->sha1); - - memcpy(ce->name, name, len); - ce->name[len] = '\0'; + if (!previous_name) { + /* v3 and earlier */ + if (len == CE_NAMEMASK) + len = strlen(name); + ce = cache_entry_from_ondisk(ondisk, flags, name, len); + + *ent_size = ondisk_ce_size(ce); + } else { + unsigned long consumed; + consumed = expand_name_field(previous_name, name); + ce = cache_entry_from_ondisk(ondisk, flags, + previous_name->buf, + previous_name->len); + + *ent_size = (name - ((char *)ondisk)) + consumed; + } return ce; } @@ -1279,6 +1402,7 @@ int read_index_from(struct index_state *istate, const char *path) struct cache_header *hdr; void *mmap; size_t mmap_size; + struct strbuf previous_name_buf = STRBUF_INIT, *previous_name; errno = EBUSY; if (istate->initialized) @@ -1311,22 +1435,30 @@ int read_index_from(struct index_state *istate, const char *path) if (verify_hdr(hdr, mmap_size) < 0) goto unmap; + istate->version = ntohl(hdr->hdr_version); istate->cache_nr = ntohl(hdr->hdr_entries); istate->cache_alloc = alloc_nr(istate->cache_nr); istate->cache = xcalloc(istate->cache_alloc, sizeof(struct cache_entry *)); istate->initialized = 1; + if (istate->version == 4) + previous_name = &previous_name_buf; + else + previous_name = NULL; + src_offset = sizeof(*hdr); for (i = 0; i < istate->cache_nr; i++) { struct ondisk_cache_entry *disk_ce; struct cache_entry *ce; + unsigned long consumed; disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset); - ce = create_from_disk(disk_ce); + ce = create_from_disk(disk_ce, &consumed, previous_name); set_index_entry(istate, i, ce); - src_offset += ondisk_ce_size(ce); + src_offset += consumed; } + strbuf_release(&previous_name_buf); istate->timestamp.sec = st.st_mtime; istate->timestamp.nsec = ST_MTIME_NSEC(st); @@ -1510,13 +1642,10 @@ static void ce_smudge_racily_clean_entry(struct cache_entry *ce) } } -static int ce_write_entry(git_SHA_CTX *c, int fd, struct cache_entry *ce) +/* Copy miscellaneous fields but not the name */ +static char *copy_cache_entry_to_ondisk(struct ondisk_cache_entry *ondisk, + struct cache_entry *ce) { - int size = ondisk_ce_size(ce); - struct ondisk_cache_entry *ondisk = xcalloc(1, size); - char *name; - int result; - ondisk->ctime.sec = htonl(ce->ce_ctime.sec); ondisk->mtime.sec = htonl(ce->ce_mtime.sec); ondisk->ctime.nsec = htonl(ce->ce_ctime.nsec); @@ -1533,11 +1662,52 @@ static int ce_write_entry(git_SHA_CTX *c, int fd, struct cache_entry *ce) struct ondisk_cache_entry_extended *ondisk2; ondisk2 = (struct ondisk_cache_entry_extended *)ondisk; ondisk2->flags2 = htons((ce->ce_flags & CE_EXTENDED_FLAGS) >> 16); - name = ondisk2->name; + return ondisk2->name; + } + else { + return ondisk->name; + } +} + +static int ce_write_entry(git_SHA_CTX *c, int fd, struct cache_entry *ce, + struct strbuf *previous_name) +{ + int size; + struct ondisk_cache_entry *ondisk; + char *name; + int result; + + if (!previous_name) { + size = ondisk_ce_size(ce); + ondisk = xcalloc(1, size); + name = copy_cache_entry_to_ondisk(ondisk, ce); + memcpy(name, ce->name, ce_namelen(ce)); + } else { + int common, to_remove, prefix_size; + unsigned char to_remove_vi[16]; + for (common = 0; + (ce->name[common] && + common < previous_name->len && + ce->name[common] == previous_name->buf[common]); + common++) + ; /* still matching */ + to_remove = previous_name->len - common; + prefix_size = encode_varint(to_remove, to_remove_vi); + + if (ce->ce_flags & CE_EXTENDED) + size = offsetof(struct ondisk_cache_entry_extended, name); + else + size = offsetof(struct ondisk_cache_entry, name); + size += prefix_size + (ce_namelen(ce) - common + 1); + + ondisk = xcalloc(1, size); + name = copy_cache_entry_to_ondisk(ondisk, ce); + memcpy(name, to_remove_vi, prefix_size); + memcpy(name + prefix_size, ce->name + common, ce_namelen(ce) - common); + + strbuf_splice(previous_name, common, to_remove, + ce->name + common, ce_namelen(ce) - common); } - else - name = ondisk->name; - memcpy(name, ce->name, ce_namelen(ce)); result = ce_write(c, fd, ondisk, size); free(ondisk); @@ -1573,10 +1743,11 @@ int write_index(struct index_state *istate, int newfd) { git_SHA_CTX c; struct cache_header hdr; - int i, err, removed, extended; + int i, err, removed, extended, hdr_version; struct cache_entry **cache = istate->cache; int entries = istate->cache_nr; struct stat st; + struct strbuf previous_name_buf = STRBUF_INIT, *previous_name; for (i = removed = extended = 0; i < entries; i++) { if (cache[i]->ce_flags & CE_REMOVE) @@ -1590,24 +1761,34 @@ int write_index(struct index_state *istate, int newfd) } } + if (!istate->version) + istate->version = INDEX_FORMAT_DEFAULT; + + /* demote version 3 to version 2 when the latter suffices */ + if (istate->version == 3 || istate->version == 2) + istate->version = extended ? 3 : 2; + + hdr_version = istate->version; + hdr.hdr_signature = htonl(CACHE_SIGNATURE); - /* for extended format, increase version so older git won't try to read it */ - hdr.hdr_version = htonl(extended ? 3 : 2); + hdr.hdr_version = htonl(hdr_version); hdr.hdr_entries = htonl(entries - removed); git_SHA1_Init(&c); if (ce_write(&c, newfd, &hdr, sizeof(hdr)) < 0) return -1; + previous_name = (hdr_version == 4) ? &previous_name_buf : NULL; for (i = 0; i < entries; i++) { struct cache_entry *ce = cache[i]; if (ce->ce_flags & CE_REMOVE) continue; if (!ce_uptodate(ce) && is_racy_timestamp(istate, ce)) ce_smudge_racily_clean_entry(ce); - if (ce_write_entry(&c, newfd, ce) < 0) + if (ce_write_entry(&c, newfd, ce, previous_name) < 0) return -1; } + strbuf_release(&previous_name_buf); /* Write extension data here */ if (istate->cache_tree) { diff --git a/unpack-trees.c b/unpack-trees.c index 36523da22a..1d7393d84c 100644 --- a/unpack-trees.c +++ b/unpack-trees.c @@ -1027,6 +1027,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options o->result.initialized = 1; o->result.timestamp.sec = o->src_index->timestamp.sec; o->result.timestamp.nsec = o->src_index->timestamp.nsec; + o->result.version = o->src_index->version; o->merge_size = len; mark_all_ce_unused(o->src_index); diff --git a/varint.c b/varint.c new file mode 100644 index 0000000000..4ed7729496 --- /dev/null +++ b/varint.c @@ -0,0 +1,29 @@ +#include "varint.h" + +uintmax_t decode_varint(const unsigned char **bufp) +{ + const unsigned char *buf = *bufp; + unsigned char c = *buf++; + uintmax_t val = c & 127; + while (c & 128) { + val += 1; + if (!val || MSB(val, 7)) + return 0; /* overflow */ + c = *buf++; + val = (val << 7) + (c & 127); + } + *bufp = buf; + return val; +} + +int encode_varint(uintmax_t value, unsigned char *buf) +{ + unsigned char varint[16]; + unsigned pos = sizeof(varint) - 1; + varint[pos] = value & 127; + while (value >>= 7) + varint[--pos] = 128 | (--value & 127); + if (buf) + memcpy(buf, varint + pos, sizeof(varint) - pos); + return sizeof(varint) - pos; +} diff --git a/varint.h b/varint.h new file mode 100644 index 0000000000..0321195796 --- /dev/null +++ b/varint.h @@ -0,0 +1,9 @@ +#ifndef VARINT_H +#define VARINT_H + +#include "git-compat-util.h" + +extern int encode_varint(uintmax_t, unsigned char *); +extern uintmax_t decode_varint(const unsigned char **); + +#endif /* VARINT_H */