From d2c1898571a6a2324593e92163e8754880e0c1fb Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Tue, 3 Apr 2012 15:53:08 -0700 Subject: [PATCH 01/12] varint: make it available outside the context of pack Signed-off-by: Junio C Hamano --- Makefile | 2 ++ varint.c | 29 +++++++++++++++++++++++++++++ varint.h | 9 +++++++++ 3 files changed, 40 insertions(+) create mode 100644 varint.c create mode 100644 varint.h diff --git a/Makefile b/Makefile index be1957a5e9..0f26c879d8 100644 --- a/Makefile +++ b/Makefile @@ -627,6 +627,7 @@ LIB_H += tree-walk.h LIB_H += unpack-trees.h LIB_H += userdiff.h LIB_H += utf8.h +LIB_H += varint.h LIB_H += xdiff-interface.h LIB_H += xdiff/xdiff.h @@ -752,6 +753,7 @@ LIB_OBJS += url.o LIB_OBJS += usage.o LIB_OBJS += userdiff.o LIB_OBJS += utf8.o +LIB_OBJS += varint.o LIB_OBJS += walker.o LIB_OBJS += wrapper.o LIB_OBJS += write_or_die.o diff --git a/varint.c b/varint.c new file mode 100644 index 0000000000..4ed7729496 --- /dev/null +++ b/varint.c @@ -0,0 +1,29 @@ +#include "varint.h" + +uintmax_t decode_varint(const unsigned char **bufp) +{ + const unsigned char *buf = *bufp; + unsigned char c = *buf++; + uintmax_t val = c & 127; + while (c & 128) { + val += 1; + if (!val || MSB(val, 7)) + return 0; /* overflow */ + c = *buf++; + val = (val << 7) + (c & 127); + } + *bufp = buf; + return val; +} + +int encode_varint(uintmax_t value, unsigned char *buf) +{ + unsigned char varint[16]; + unsigned pos = sizeof(varint) - 1; + varint[pos] = value & 127; + while (value >>= 7) + varint[--pos] = 128 | (--value & 127); + if (buf) + memcpy(buf, varint + pos, sizeof(varint) - pos); + return sizeof(varint) - pos; +} diff --git a/varint.h b/varint.h new file mode 100644 index 0000000000..0321195796 --- /dev/null +++ b/varint.h @@ -0,0 +1,9 @@ +#ifndef VARINT_H +#define VARINT_H + +#include "git-compat-util.h" + +extern int encode_varint(uintmax_t, unsigned char *); +extern uintmax_t decode_varint(const unsigned char **); + +#endif /* VARINT_H */ From 
db3b313c84522d89ac358187a7f6928dfcfb1e7d Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Tue, 3 Apr 2012 15:53:09 -0700 Subject: [PATCH 02/12] cache.h: hide on-disk index details The on-disk format of the index file is a detail whose implementation is neatly encapsulated in read-cache.c; there is no need to expose it to the general public that include the cache.h header file. Also add a prominent mark to read-cache.c to delineate the parts that deal with the index file I/O routines from the remainder of the file. Signed-off-by: Junio C Hamano --- cache.h | 48 ---------------------------------------------- read-cache.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+), 48 deletions(-) diff --git a/cache.h b/cache.h index e5e1aa4e15..65a7aba158 100644 --- a/cache.h +++ b/cache.h @@ -115,48 +115,6 @@ struct cache_time { unsigned int nsec; }; -/* - * dev/ino/uid/gid/size are also just tracked to the low 32 bits - * Again - this is just a (very strong in practice) heuristic that - * the inode hasn't changed. - * - * We save the fields in big-endian order to allow using the - * index file over NFS transparently. 
- */ -struct ondisk_cache_entry { - struct cache_time ctime; - struct cache_time mtime; - unsigned int dev; - unsigned int ino; - unsigned int mode; - unsigned int uid; - unsigned int gid; - unsigned int size; - unsigned char sha1[20]; - unsigned short flags; - char name[FLEX_ARRAY]; /* more */ -}; - -/* - * This struct is used when CE_EXTENDED bit is 1 - * The struct must match ondisk_cache_entry exactly from - * ctime till flags - */ -struct ondisk_cache_entry_extended { - struct cache_time ctime; - struct cache_time mtime; - unsigned int dev; - unsigned int ino; - unsigned int mode; - unsigned int uid; - unsigned int gid; - unsigned int size; - unsigned char sha1[20]; - unsigned short flags; - unsigned short flags2; - char name[FLEX_ARRAY]; /* more */ -}; - struct cache_entry { struct cache_time ce_ctime; struct cache_time ce_mtime; @@ -253,9 +211,6 @@ static inline size_t ce_namelen(const struct cache_entry *ce) } #define ce_size(ce) cache_entry_size(ce_namelen(ce)) -#define ondisk_ce_size(ce) (((ce)->ce_flags & CE_EXTENDED) ? 
\ - ondisk_cache_entry_extended_size(ce_namelen(ce)) : \ - ondisk_cache_entry_size(ce_namelen(ce))) #define ce_stage(ce) ((CE_STAGEMASK & (ce)->ce_flags) >> CE_STAGESHIFT) #define ce_uptodate(ce) ((ce)->ce_flags & CE_UPTODATE) #define ce_skip_worktree(ce) ((ce)->ce_flags & CE_SKIP_WORKTREE) @@ -306,10 +261,7 @@ static inline unsigned int canon_mode(unsigned int mode) return S_IFGITLINK; } -#define flexible_size(STRUCT,len) ((offsetof(struct STRUCT,name) + (len) + 8) & ~7) #define cache_entry_size(len) (offsetof(struct cache_entry,name) + (len) + 1) -#define ondisk_cache_entry_size(len) flexible_size(ondisk_cache_entry,len) -#define ondisk_cache_entry_extended_size(len) flexible_size(ondisk_cache_entry_extended,len) struct index_state { struct cache_entry **cache; diff --git a/read-cache.c b/read-cache.c index 274e54b4f3..fa8aa7376b 100644 --- a/read-cache.c +++ b/read-cache.c @@ -1189,6 +1189,60 @@ static struct cache_entry *refresh_cache_entry(struct cache_entry *ce, int reall return refresh_cache_ent(&the_index, ce, really, NULL, NULL); } + +/***************************************************************** + * Index File I/O + *****************************************************************/ + +/* + * dev/ino/uid/gid/size are also just tracked to the low 32 bits + * Again - this is just a (very strong in practice) heuristic that + * the inode hasn't changed. + * + * We save the fields in big-endian order to allow using the + * index file over NFS transparently. 
+ */ +struct ondisk_cache_entry { + struct cache_time ctime; + struct cache_time mtime; + unsigned int dev; + unsigned int ino; + unsigned int mode; + unsigned int uid; + unsigned int gid; + unsigned int size; + unsigned char sha1[20]; + unsigned short flags; + char name[FLEX_ARRAY]; /* more */ +}; + +/* + * This struct is used when CE_EXTENDED bit is 1 + * The struct must match ondisk_cache_entry exactly from + * ctime till flags + */ +struct ondisk_cache_entry_extended { + struct cache_time ctime; + struct cache_time mtime; + unsigned int dev; + unsigned int ino; + unsigned int mode; + unsigned int uid; + unsigned int gid; + unsigned int size; + unsigned char sha1[20]; + unsigned short flags; + unsigned short flags2; + char name[FLEX_ARRAY]; /* more */ +}; + +#define align_flex_name(STRUCT,len) ((offsetof(struct STRUCT,name) + (len) + 8) & ~7) +#define ondisk_cache_entry_size(len) align_flex_name(ondisk_cache_entry,len) +#define ondisk_cache_entry_extended_size(len) align_flex_name(ondisk_cache_entry_extended,len) +#define ondisk_ce_size(ce) (((ce)->ce_flags & CE_EXTENDED) ? \ + ondisk_cache_entry_extended_size(ce_namelen(ce)) : \ + ondisk_cache_entry_size(ce_namelen(ce))) + static int verify_hdr(struct cache_header *hdr, unsigned long size) { git_SHA_CTX c; From d60c49c2d7f683cd24a738533846cd3e34f2073e Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Tue, 3 Apr 2012 15:53:10 -0700 Subject: [PATCH 03/12] read-cache.c: allow unaligned mapping of the index file Both the on-disk format v2 and v3 pads the "name" field to the multiple of eight to make sure that various quantities in network long/short type can be accessed with ntohl/ntohs without having to worry about alignment, but this forces us to waste disk I/O bandwidth. Introduce ntoh_s()/ntoh_l() macros that the callers can use as if they were the regular ntohs()/ntohl() on a field that may not be aligned correctly. 
Signed-off-by: Junio C Hamano --- read-cache.c | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/read-cache.c b/read-cache.c index fa8aa7376b..d8865f5175 100644 --- a/read-cache.c +++ b/read-cache.c @@ -1285,6 +1285,26 @@ int read_index(struct index_state *istate) return read_index_from(istate, get_index_file()); } +#ifndef NEEDS_ALIGNED_ACCESS +#define ntoh_s(var) ntohs(var) +#define ntoh_l(var) ntohl(var) +#else +static inline uint16_t ntoh_s_force_align(void *p) +{ + uint16_t x; + memcpy(&x, p, sizeof(x)); + return ntohs(x); +} +static inline uint32_t ntoh_l_force_align(void *p) +{ + uint32_t x; + memcpy(&x, p, sizeof(x)); + return ntohl(x); +} +#define ntoh_s(var) ntoh_s_force_align(&(var)) +#define ntoh_l(var) ntoh_l_force_align(&(var)) +#endif + static struct cache_entry *create_from_disk(struct ondisk_cache_entry *ondisk) { struct cache_entry *ce; @@ -1293,14 +1313,14 @@ static struct cache_entry *create_from_disk(struct ondisk_cache_entry *ondisk) unsigned int flags; /* On-disk flags are just 16 bits */ - flags = ntohs(ondisk->flags); + flags = ntoh_s(ondisk->flags); len = flags & CE_NAMEMASK; if (flags & CE_EXTENDED) { struct ondisk_cache_entry_extended *ondisk2; int extended_flags; ondisk2 = (struct ondisk_cache_entry_extended *)ondisk; - extended_flags = ntohs(ondisk2->flags2) << 16; + extended_flags = ntoh_s(ondisk2->flags2) << 16; /* We do not yet understand any bit out of CE_EXTENDED_FLAGS */ if (extended_flags & ~CE_EXTENDED_FLAGS) die("Unknown index entry format %08x", extended_flags); @@ -1315,16 +1335,16 @@ static struct cache_entry *create_from_disk(struct ondisk_cache_entry *ondisk) ce = xmalloc(cache_entry_size(len)); - ce->ce_ctime.sec = ntohl(ondisk->ctime.sec); - ce->ce_mtime.sec = ntohl(ondisk->mtime.sec); - ce->ce_ctime.nsec = ntohl(ondisk->ctime.nsec); - ce->ce_mtime.nsec = ntohl(ondisk->mtime.nsec); - ce->ce_dev = ntohl(ondisk->dev); - ce->ce_ino = ntohl(ondisk->ino); - 
ce->ce_mode = ntohl(ondisk->mode); - ce->ce_uid = ntohl(ondisk->uid); - ce->ce_gid = ntohl(ondisk->gid); - ce->ce_size = ntohl(ondisk->size); + ce->ce_ctime.sec = ntoh_l(ondisk->ctime.sec); + ce->ce_mtime.sec = ntoh_l(ondisk->mtime.sec); + ce->ce_ctime.nsec = ntoh_l(ondisk->ctime.nsec); + ce->ce_mtime.nsec = ntoh_l(ondisk->mtime.nsec); + ce->ce_dev = ntoh_l(ondisk->dev); + ce->ce_ino = ntoh_l(ondisk->ino); + ce->ce_mode = ntoh_l(ondisk->mode); + ce->ce_uid = ntoh_l(ondisk->uid); + ce->ce_gid = ntoh_l(ondisk->gid); + ce->ce_size = ntoh_l(ondisk->size); ce->ce_flags = flags; hashcpy(ce->sha1, ondisk->sha1); From 936f53d055c9a336e9ad89ad06c4efd56e5896e8 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Tue, 3 Apr 2012 15:53:11 -0700 Subject: [PATCH 04/12] read-cache.c: make create_from_disk() report number of bytes it consumed The function is the one that is reading from the data stream. It only is natural to make it responsible for reporting this number, not the caller. Signed-off-by: Junio C Hamano --- read-cache.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/read-cache.c b/read-cache.c index d8865f5175..58bfb2481d 100644 --- a/read-cache.c +++ b/read-cache.c @@ -1305,7 +1305,8 @@ static inline uint32_t ntoh_l_force_align(void *p) #define ntoh_l(var) ntoh_l_force_align(&(var)) #endif -static struct cache_entry *create_from_disk(struct ondisk_cache_entry *ondisk) +static struct cache_entry *create_from_disk(struct ondisk_cache_entry *ondisk, + unsigned long *ent_size) { struct cache_entry *ce; size_t len; @@ -1351,6 +1352,7 @@ static struct cache_entry *create_from_disk(struct ondisk_cache_entry *ondisk) memcpy(ce->name, name, len); ce->name[len] = '\0'; + *ent_size = ondisk_ce_size(ce); return ce; } @@ -1404,12 +1406,13 @@ int read_index_from(struct index_state *istate, const char *path) for (i = 0; i < istate->cache_nr; i++) { struct ondisk_cache_entry *disk_ce; struct cache_entry *ce; + unsigned long consumed; disk_ce = (struct 
ondisk_cache_entry *)((char *)mmap + src_offset); - ce = create_from_disk(disk_ce); + ce = create_from_disk(disk_ce, &consumed); set_index_entry(istate, i, ce); - src_offset += ondisk_ce_size(ce); + src_offset += consumed; } istate->timestamp.sec = st.st_mtime; istate->timestamp.nsec = ST_MTIME_NSEC(st); From 0136bac9b8a449cdf2179ad0242e079262b0aeea Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Tue, 3 Apr 2012 15:53:12 -0700 Subject: [PATCH 05/12] read-cache.c: report the header version we do not understand Instead of just saying "bad index version", report the value we read from the disk. Signed-off-by: Junio C Hamano --- read-cache.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/read-cache.c b/read-cache.c index 58bfb2481d..2d938263ad 100644 --- a/read-cache.c +++ b/read-cache.c @@ -1247,11 +1247,13 @@ static int verify_hdr(struct cache_header *hdr, unsigned long size) { git_SHA_CTX c; unsigned char sha1[20]; + int hdr_version; if (hdr->hdr_signature != htonl(CACHE_SIGNATURE)) return error("bad signature"); - if (hdr->hdr_version != htonl(2) && hdr->hdr_version != htonl(3)) - return error("bad index version"); + hdr_version = ntohl(hdr->hdr_version); + if (hdr_version < 2 || 3 < hdr_version) + return error("bad index version %d", hdr_version); git_SHA1_Init(&c); git_SHA1_Update(&c, hdr, size - 20); git_SHA1_Final(sha1, &c); From 3fc22b53313ff035da145b2cb59e587ff3868654 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Tue, 3 Apr 2012 15:53:13 -0700 Subject: [PATCH 06/12] read-cache.c: move code to copy ondisk to incore cache to a helper function This makes the change in a later patch look less scary. 
Signed-off-by: Junio C Hamano --- read-cache.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/read-cache.c b/read-cache.c index 2d938263ad..82711c22bc 100644 --- a/read-cache.c +++ b/read-cache.c @@ -1307,6 +1307,30 @@ static inline uint32_t ntoh_l_force_align(void *p) #define ntoh_l(var) ntoh_l_force_align(&(var)) #endif +static struct cache_entry *cache_entry_from_ondisk(struct ondisk_cache_entry *ondisk, + unsigned int flags, + const char *name, + size_t len) +{ + struct cache_entry *ce = xmalloc(cache_entry_size(len)); + + ce->ce_ctime.sec = ntoh_l(ondisk->ctime.sec); + ce->ce_mtime.sec = ntoh_l(ondisk->mtime.sec); + ce->ce_ctime.nsec = ntoh_l(ondisk->ctime.nsec); + ce->ce_mtime.nsec = ntoh_l(ondisk->mtime.nsec); + ce->ce_dev = ntoh_l(ondisk->dev); + ce->ce_ino = ntoh_l(ondisk->ino); + ce->ce_mode = ntoh_l(ondisk->mode); + ce->ce_uid = ntoh_l(ondisk->uid); + ce->ce_gid = ntoh_l(ondisk->gid); + ce->ce_size = ntoh_l(ondisk->size); + ce->ce_flags = flags; + hashcpy(ce->sha1, ondisk->sha1); + memcpy(ce->name, name, len); + ce->name[len] = '\0'; + return ce; +} + static struct cache_entry *create_from_disk(struct ondisk_cache_entry *ondisk, unsigned long *ent_size) { @@ -1335,25 +1359,7 @@ static struct cache_entry *create_from_disk(struct ondisk_cache_entry *ondisk, if (len == CE_NAMEMASK) len = strlen(name); - - ce = xmalloc(cache_entry_size(len)); - - ce->ce_ctime.sec = ntoh_l(ondisk->ctime.sec); - ce->ce_mtime.sec = ntoh_l(ondisk->mtime.sec); - ce->ce_ctime.nsec = ntoh_l(ondisk->ctime.nsec); - ce->ce_mtime.nsec = ntoh_l(ondisk->mtime.nsec); - ce->ce_dev = ntoh_l(ondisk->dev); - ce->ce_ino = ntoh_l(ondisk->ino); - ce->ce_mode = ntoh_l(ondisk->mode); - ce->ce_uid = ntoh_l(ondisk->uid); - ce->ce_gid = ntoh_l(ondisk->gid); - ce->ce_size = ntoh_l(ondisk->size); - ce->ce_flags = flags; - - hashcpy(ce->sha1, ondisk->sha1); - - memcpy(ce->name, name, len); - ce->name[len] = '\0'; + ce = 
cache_entry_from_ondisk(ondisk, flags, name, len); *ent_size = ondisk_ce_size(ce); return ce; } From f136f7bfe816b46ebabf5439f8e55f37097ca353 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Tue, 3 Apr 2012 15:53:14 -0700 Subject: [PATCH 07/12] read-cache.c: move code to copy incore to ondisk cache to a helper function This makes the change in a later patch look less scary. Signed-off-by: Junio C Hamano --- read-cache.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/read-cache.c b/read-cache.c index 82711c22bc..c159351eaf 100644 --- a/read-cache.c +++ b/read-cache.c @@ -1605,13 +1605,10 @@ static void ce_smudge_racily_clean_entry(struct cache_entry *ce) } } -static int ce_write_entry(git_SHA_CTX *c, int fd, struct cache_entry *ce) +/* Copy miscellaneous fields but not the name */ +static char *copy_cache_entry_to_ondisk(struct ondisk_cache_entry *ondisk, + struct cache_entry *ce) { - int size = ondisk_ce_size(ce); - struct ondisk_cache_entry *ondisk = xcalloc(1, size); - char *name; - int result; - ondisk->ctime.sec = htonl(ce->ce_ctime.sec); ondisk->mtime.sec = htonl(ce->ce_mtime.sec); ondisk->ctime.nsec = htonl(ce->ce_ctime.nsec); @@ -1628,10 +1625,21 @@ static int ce_write_entry(git_SHA_CTX *c, int fd, struct cache_entry *ce) struct ondisk_cache_entry_extended *ondisk2; ondisk2 = (struct ondisk_cache_entry_extended *)ondisk; ondisk2->flags2 = htons((ce->ce_flags & CE_EXTENDED_FLAGS) >> 16); - name = ondisk2->name; + return ondisk2->name; } - else - name = ondisk->name; + else { + return ondisk->name; + } +} + +static int ce_write_entry(git_SHA_CTX *c, int fd, struct cache_entry *ce) +{ + int size = ondisk_ce_size(ce); + struct ondisk_cache_entry *ondisk = xcalloc(1, size); + char *name; + int result; + + name = copy_cache_entry_to_ondisk(ondisk, ce); memcpy(name, ce->name, ce_namelen(ce)); result = ce_write(c, fd, ondisk, size); From 6c9cd161d9d1bee349b1389d661282c92da098bc Mon Sep 17 00:00:00 2001 From: Junio C 
Hamano Date: Tue, 3 Apr 2012 15:53:15 -0700 Subject: [PATCH 08/12] read-cache.c: read prefix-compressed names in index on-disk version v4 Because the entries are sorted by path, adjacent entries in the index tend to share the leading components of them, and it makes sense to only store the differences in later entries. In the v4 on-disk format of the index, each on-disk cache entry stores the number of bytes to be stripped from the end of the previous name, and the bytes to append to the result, to come up with its name. Signed-off-by: Junio C Hamano --- read-cache.c | 58 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/read-cache.c b/read-cache.c index c159351eaf..1c173f7a63 100644 --- a/read-cache.c +++ b/read-cache.c @@ -12,6 +12,8 @@ #include "commit.h" #include "blob.h" #include "resolve-undo.h" +#include "strbuf.h" +#include "varint.h" static struct cache_entry *refresh_cache_entry(struct cache_entry *ce, int really); @@ -1236,6 +1238,7 @@ struct ondisk_cache_entry_extended { char name[FLEX_ARRAY]; /* more */ }; +/* These are only used for v3 or lower */ #define align_flex_name(STRUCT,len) ((offsetof(struct STRUCT,name) + (len) + 8) & ~7) #define ondisk_cache_entry_size(len) align_flex_name(ondisk_cache_entry,len) #define ondisk_cache_entry_extended_size(len) align_flex_name(ondisk_cache_entry_extended,len) @@ -1252,7 +1255,7 @@ static int verify_hdr(struct cache_header *hdr, unsigned long size) if (hdr->hdr_signature != htonl(CACHE_SIGNATURE)) return error("bad signature"); hdr_version = ntohl(hdr->hdr_version); - if (hdr_version < 2 || 3 < hdr_version) + if (hdr_version < 2 || 4 < hdr_version) return error("bad index version %d", hdr_version); git_SHA1_Init(&c); git_SHA1_Update(&c, hdr, size - 20); @@ -1331,8 +1334,30 @@ static struct cache_entry *cache_entry_from_ondisk(struct ondisk_cache_entry *on return ce; } +/* + * Adjacent cache entries tend to share the leading paths, so it makes + * 
sense to only store the differences in later entries. In the v4 + * on-disk format of the index, each on-disk cache entry stores the + * number of bytes to be stripped from the end of the previous name, + * and the bytes to append to the result, to come up with its name. + */ +static unsigned long expand_name_field(struct strbuf *name, const char *cp_) +{ + const unsigned char *ep, *cp = (const unsigned char *)cp_; + size_t len = decode_varint(&cp); + + if (name->len < len) + die("malformed name field in the index"); + strbuf_remove(name, name->len - len, len); + for (ep = cp; *ep; ep++) + ; /* find the end */ + strbuf_add(name, cp, ep - cp); + return (const char *)ep + 1 - cp_; +} + static struct cache_entry *create_from_disk(struct ondisk_cache_entry *ondisk, - unsigned long *ent_size) + unsigned long *ent_size, + struct strbuf *previous_name) { struct cache_entry *ce; size_t len; @@ -1357,10 +1382,22 @@ static struct cache_entry *create_from_disk(struct ondisk_cache_entry *ondisk, else name = ondisk->name; - if (len == CE_NAMEMASK) - len = strlen(name); - ce = cache_entry_from_ondisk(ondisk, flags, name, len); - *ent_size = ondisk_ce_size(ce); + if (!previous_name) { + /* v3 and earlier */ + if (len == CE_NAMEMASK) + len = strlen(name); + ce = cache_entry_from_ondisk(ondisk, flags, name, len); + + *ent_size = ondisk_ce_size(ce); + } else { + unsigned long consumed; + consumed = expand_name_field(previous_name, name); + ce = cache_entry_from_ondisk(ondisk, flags, + previous_name->buf, + previous_name->len); + + *ent_size = (name - ((char *)ondisk)) + consumed; + } return ce; } @@ -1373,6 +1410,7 @@ int read_index_from(struct index_state *istate, const char *path) struct cache_header *hdr; void *mmap; size_t mmap_size; + struct strbuf previous_name_buf = STRBUF_INIT, *previous_name; errno = EBUSY; if (istate->initialized) @@ -1410,6 +1448,11 @@ int read_index_from(struct index_state *istate, const char *path) istate->cache = xcalloc(istate->cache_alloc, 
sizeof(struct cache_entry *)); istate->initialized = 1; + if (hdr->hdr_version == htonl(4)) + previous_name = &previous_name_buf; + else + previous_name = NULL; + src_offset = sizeof(*hdr); for (i = 0; i < istate->cache_nr; i++) { struct ondisk_cache_entry *disk_ce; @@ -1417,11 +1460,12 @@ int read_index_from(struct index_state *istate, const char *path) unsigned long consumed; disk_ce = (struct ondisk_cache_entry *)((char *)mmap + src_offset); - ce = create_from_disk(disk_ce, &consumed); + ce = create_from_disk(disk_ce, &consumed, previous_name); set_index_entry(istate, i, ce); src_offset += consumed; } + strbuf_release(&previous_name_buf); istate->timestamp.sec = st.st_mtime; istate->timestamp.nsec = ST_MTIME_NSEC(st); From 9d227781b688707f64ee1703a1156b0c83247c33 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Wed, 4 Apr 2012 09:12:43 -0700 Subject: [PATCH 09/12] read-cache.c: write prefix-compressed names in the index Teach the code to write the index in the v4 on-disk format. Record the format version of the on-disk index we read from in the index_state, and use the format when writing the new index out. Signed-off-by: Junio C Hamano --- cache.h | 4 ++++ read-cache.c | 64 ++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 58 insertions(+), 10 deletions(-) diff --git a/cache.h b/cache.h index 65a7aba158..a3f1279a3e 100644 --- a/cache.h +++ b/cache.h @@ -105,6 +105,9 @@ struct cache_header { unsigned int hdr_entries; }; +#define INDEX_FORMAT_LB 2 +#define INDEX_FORMAT_UB 4 + /* * The "cache_time" is just the low 32 bits of the * time. 
It doesn't matter if it overflows - we only @@ -265,6 +268,7 @@ static inline unsigned int canon_mode(unsigned int mode) struct index_state { struct cache_entry **cache; + unsigned int version; unsigned int cache_nr, cache_alloc, cache_changed; struct string_list *resolve_undo; struct cache_tree *cache_tree; diff --git a/read-cache.c b/read-cache.c index 1c173f7a63..adda1daf03 100644 --- a/read-cache.c +++ b/read-cache.c @@ -1196,6 +1196,8 @@ static struct cache_entry *refresh_cache_entry(struct cache_entry *ce, int reall * Index File I/O *****************************************************************/ +#define INDEX_FORMAT_DEFAULT 3 + /* * dev/ino/uid/gid/size are also just tracked to the low 32 bits * Again - this is just a (very strong in practice) heuristic that @@ -1443,12 +1445,13 @@ int read_index_from(struct index_state *istate, const char *path) if (verify_hdr(hdr, mmap_size) < 0) goto unmap; + istate->version = ntohl(hdr->hdr_version); istate->cache_nr = ntohl(hdr->hdr_entries); istate->cache_alloc = alloc_nr(istate->cache_nr); istate->cache = xcalloc(istate->cache_alloc, sizeof(struct cache_entry *)); istate->initialized = 1; - if (hdr->hdr_version == htonl(4)) + if (istate->version == 4) previous_name = &previous_name_buf; else previous_name = NULL; @@ -1676,15 +1679,45 @@ static char *copy_cache_entry_to_ondisk(struct ondisk_cache_entry *ondisk, } } -static int ce_write_entry(git_SHA_CTX *c, int fd, struct cache_entry *ce) +static int ce_write_entry(git_SHA_CTX *c, int fd, struct cache_entry *ce, + struct strbuf *previous_name) { - int size = ondisk_ce_size(ce); - struct ondisk_cache_entry *ondisk = xcalloc(1, size); + int size; + struct ondisk_cache_entry *ondisk; char *name; int result; - name = copy_cache_entry_to_ondisk(ondisk, ce); - memcpy(name, ce->name, ce_namelen(ce)); + if (!previous_name) { + size = ondisk_ce_size(ce); + ondisk = xcalloc(1, size); + name = copy_cache_entry_to_ondisk(ondisk, ce); + memcpy(name, ce->name, ce_namelen(ce)); + 
} else { + int common, to_remove, prefix_size; + unsigned char to_remove_vi[16]; + for (common = 0; + (ce->name[common] && + common < previous_name->len && + ce->name[common] == previous_name->buf[common]); + common++) + ; /* still matching */ + to_remove = previous_name->len - common; + prefix_size = encode_varint(to_remove, to_remove_vi); + + if (ce->ce_flags & CE_EXTENDED) + size = offsetof(struct ondisk_cache_entry_extended, name); + else + size = offsetof(struct ondisk_cache_entry, name); + size += prefix_size + (ce_namelen(ce) - common + 1); + + ondisk = xcalloc(1, size); + name = copy_cache_entry_to_ondisk(ondisk, ce); + memcpy(name, to_remove_vi, prefix_size); + memcpy(name + prefix_size, ce->name + common, ce_namelen(ce) - common); + + strbuf_splice(previous_name, common, to_remove, + ce->name + common, ce_namelen(ce) - common); + } result = ce_write(c, fd, ondisk, size); free(ondisk); @@ -1720,10 +1753,11 @@ int write_index(struct index_state *istate, int newfd) { git_SHA_CTX c; struct cache_header hdr; - int i, err, removed, extended; + int i, err, removed, extended, hdr_version; struct cache_entry **cache = istate->cache; int entries = istate->cache_nr; struct stat st; + struct strbuf previous_name_buf = STRBUF_INIT, *previous_name; for (i = removed = extended = 0; i < entries; i++) { if (cache[i]->ce_flags & CE_REMOVE) @@ -1737,24 +1771,34 @@ int write_index(struct index_state *istate, int newfd) } } + if (!istate->version) + istate->version = INDEX_FORMAT_DEFAULT; + + /* demote version 3 to version 2 when the latter suffices */ + if (istate->version == 3 || istate->version == 2) + istate->version = extended ? 3 : 2; + + hdr_version = istate->version; + hdr.hdr_signature = htonl(CACHE_SIGNATURE); - /* for extended format, increase version so older git won't try to read it */ - hdr.hdr_version = htonl(extended ? 
3 : 2); + hdr.hdr_version = htonl(hdr_version); hdr.hdr_entries = htonl(entries - removed); git_SHA1_Init(&c); if (ce_write(&c, newfd, &hdr, sizeof(hdr)) < 0) return -1; + previous_name = (hdr_version == 4) ? &previous_name_buf : NULL; for (i = 0; i < entries; i++) { struct cache_entry *ce = cache[i]; if (ce->ce_flags & CE_REMOVE) continue; if (!ce_uptodate(ce) && is_racy_timestamp(istate, ce)) ce_smudge_racily_clean_entry(ce); - if (ce_write_entry(&c, newfd, ce) < 0) + if (ce_write_entry(&c, newfd, ce, previous_name) < 0) return -1; } + strbuf_release(&previous_name_buf); /* Write extension data here */ if (istate->cache_tree) { From 69dec66b2f6ab5c8138cca37e74bd405c541318a Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Wed, 4 Apr 2012 09:37:02 -0700 Subject: [PATCH 10/12] update-index: upgrade/downgrade on-disk index version With the "--index-version <n>" parameter, write the index out in the specified version. With this, an index file that is written in newer format (say v4) can be downgraded to be read by older versions of Git. Signed-off-by: Junio C Hamano --- Documentation/git-update-index.txt | 6 +++++- builtin/update-index.c | 14 ++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/Documentation/git-update-index.txt b/Documentation/git-update-index.txt index a3081f4e23..9d0b1515c5 100644 --- a/Documentation/git-update-index.txt +++ b/Documentation/git-update-index.txt @@ -19,7 +19,7 @@ SYNOPSIS [--ignore-submodules] [--really-refresh] [--unresolve] [--again | -g] [--info-only] [--index-info] - [-z] [--stdin] + [-z] [--stdin] [--index-version <n>] [--verbose] [--] [...] @@ -143,6 +143,10 @@ you will need to handle the situation manually. --verbose:: Report what is being added and removed from index. +--index-version <n>:: + Write the resulting index out in the named on-disk format version. + The current default version is 2. + -z:: Only meaningful with `--stdin` or `--index-info`; paths are separated with NUL character instead of LF. 
diff --git a/builtin/update-index.c b/builtin/update-index.c index a6a23fa1f3..5f038d64da 100644 --- a/builtin/update-index.c +++ b/builtin/update-index.c @@ -708,6 +708,7 @@ int cmd_update_index(int argc, const char **argv, const char *prefix) int newfd, entries, has_errors = 0, line_termination = '\n'; int read_from_stdin = 0; int prefix_length = prefix ? strlen(prefix) : 0; + int preferred_index_format = 0; char set_executable_bit = 0; struct refresh_params refresh_args = {0, &has_errors}; int lock_error = 0; @@ -791,6 +792,8 @@ int cmd_update_index(int argc, const char **argv, const char *prefix) "(for porcelains) forget saved unresolved conflicts", PARSE_OPT_NOARG | PARSE_OPT_NONEG, resolve_undo_clear_callback}, + OPT_INTEGER(0, "index-version", &preferred_index_format, + "write index in this format"), OPT_END() }; @@ -851,6 +854,17 @@ int cmd_update_index(int argc, const char **argv, const char *prefix) } } argc = parse_options_end(&ctx); + if (preferred_index_format) { + if (preferred_index_format < INDEX_FORMAT_LB || + INDEX_FORMAT_UB < preferred_index_format) + die("index-version %d not in range: %d..%d", + preferred_index_format, + INDEX_FORMAT_LB, INDEX_FORMAT_UB); + + if (the_index.version != preferred_index_format) + active_cache_changed = 1; + the_index.version = preferred_index_format; + } if (read_from_stdin) { struct strbuf buf = STRBUF_INIT, nbuf = STRBUF_INIT; From 9170c7ab2889a295de49458d7bd37d82f1556cd6 Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 27 Apr 2012 15:58:13 -0700 Subject: [PATCH 11/12] unpack-trees: preserve the index file version of original Otherwise "git checkout $other_branch" (or even "git checkout HEAD") would end up writing the index out in the default format. 
Signed-off-by: Junio C Hamano --- unpack-trees.c | 1 + 1 file changed, 1 insertion(+) diff --git a/unpack-trees.c b/unpack-trees.c index 7c9ecf665d..2a037d6a42 100644 --- a/unpack-trees.c +++ b/unpack-trees.c @@ -1020,6 +1020,7 @@ int unpack_trees(unsigned len, struct tree_desc *t, struct unpack_trees_options o->result.initialized = 1; o->result.timestamp.sec = o->src_index->timestamp.sec; o->result.timestamp.nsec = o->src_index->timestamp.nsec; + o->result.version = o->src_index->version; o->merge_size = len; mark_all_ce_unused(o->src_index); From afd7bd22209c53ae4d3c73dd4bc4b225ec55e10a Mon Sep 17 00:00:00 2001 From: Junio C Hamano Date: Fri, 27 Apr 2012 16:02:45 -0700 Subject: [PATCH 12/12] index-v4: document the entry format Document the format so that others can learn from and build on top of the series. Signed-off-by: Junio C Hamano --- Documentation/technical/index-format.txt | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/Documentation/technical/index-format.txt b/Documentation/technical/index-format.txt index 8930b3fabc..9d25b30178 100644 --- a/Documentation/technical/index-format.txt +++ b/Documentation/technical/index-format.txt @@ -113,9 +113,22 @@ GIT index format are encoded in 7-bit ASCII and the encoding cannot contain a NUL byte (iow, this is a UNIX pathname). + (Version 4) In version 4, the entry path name is prefix-compressed + relative to the path name for the previous entry (the very first + entry is encoded as if the path name for the previous entry is an + empty string). At the beginning of an entry, an integer N in the + variable width encoding (the same encoding as the offset is encoded + for OFS_DELTA pack entries; see pack-format.txt) is stored, followed + by a NUL-terminated string S. Removing N bytes from the end of the + path name for the previous entry, and replacing it with the string S + yields the path name for this entry. 
+ 1-8 nul bytes as necessary to pad the entry to a multiple of eight bytes while keeping the name NUL-terminated. + (Version 4) In version 4, the padding after the pathname does not + exist. + == Extensions === Cached tree