pack-objects: better check_object() performances
With large amount of objects, check_object() is really trashing the pack
sliding map and the filesystem cache.  It has a completely random access
pattern especially with old objects where delta replay jumps back and
forth all over the pack.
This patch improves things by:
 1) sorting objects by their offset in pack before calling check_object()
    so the pack access pattern is linear;
 2) recording the object type at add_object_entry() time since it is
    already known in most cases;
 3) recording the pack offset even for preferred_base objects;
 4) avoid calling sha1_object_info() if all possible.
This limits pack accesses to the bare minimum and makes them perfectly
linear.
In the process check_object() was made more clear (to me at least).
Note: I thought about walking the sorted_by_offset list backward in
get_object_details() so if a pack happens to be larger than the available
file cache, then the cache would have been populated with useful data from
the beginning of the pack already when find_deltas() is called.  Strangely,
testing (on Linux) showed absolutely no performance difference.
Signed-off-by: Nicolas Pitre <nico@cam.org>
Signed-off-by: Junio C Hamano <junkio@cox.net>
			
			
				maint
			
			
		
							parent
							
								
									54dab52ae8
								
							
						
					
					
						commit
						5c49c11686
					
				|  | @ -813,7 +813,8 @@ static unsigned name_hash(const char *name) | |||
| 	return hash; | ||||
| } | ||||
|  | ||||
| static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclude) | ||||
| static int add_object_entry(const unsigned char *sha1, enum object_type type, | ||||
| 			    unsigned hash, int exclude) | ||||
| { | ||||
| 	struct object_entry *entry; | ||||
| 	struct packed_git *p, *found_pack = NULL; | ||||
|  | @ -831,19 +832,19 @@ static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclud | |||
| 		return 0; | ||||
| 	} | ||||
|  | ||||
| 	if (!exclude) { | ||||
| 		for (p = packed_git; p; p = p->next) { | ||||
| 			off_t offset = find_pack_entry_one(sha1, p); | ||||
| 			if (offset) { | ||||
| 				if (incremental) | ||||
| 					return 0; | ||||
| 				if (local && !p->pack_local) | ||||
| 					return 0; | ||||
| 				if (!found_pack) { | ||||
| 					found_offset = offset; | ||||
| 					found_pack = p; | ||||
| 				} | ||||
| 	for (p = packed_git; p; p = p->next) { | ||||
| 		off_t offset = find_pack_entry_one(sha1, p); | ||||
| 		if (offset) { | ||||
| 			if (!found_pack) { | ||||
| 				found_offset = offset; | ||||
| 				found_pack = p; | ||||
| 			} | ||||
| 			if (exclude) | ||||
| 				break; | ||||
| 			if (incremental) | ||||
| 				return 0; | ||||
| 			if (local && !p->pack_local) | ||||
| 				return 0; | ||||
| 		} | ||||
| 	} | ||||
|  | ||||
|  | @ -856,6 +857,8 @@ static int add_object_entry(const unsigned char *sha1, unsigned hash, int exclud | |||
| 	memset(entry, 0, sizeof(*entry)); | ||||
| 	hashcpy(entry->sha1, sha1); | ||||
| 	entry->hash = hash; | ||||
| 	if (type) | ||||
| 		entry->type = type; | ||||
| 	if (exclude) | ||||
| 		entry->preferred_base = 1; | ||||
| 	else | ||||
|  | @ -1008,7 +1011,9 @@ static void add_pbase_object(struct tree_desc *tree, | |||
| 			return; | ||||
| 		if (name[cmplen] != '/') { | ||||
| 			unsigned hash = name_hash(fullname); | ||||
| 			add_object_entry(entry.sha1, hash, 1); | ||||
| 			add_object_entry(entry.sha1, | ||||
| 					 S_ISDIR(entry.mode) ? OBJ_TREE : OBJ_BLOB, | ||||
| 					 hash, 1); | ||||
| 			return; | ||||
| 		} | ||||
| 		if (S_ISDIR(entry.mode)) { | ||||
|  | @ -1079,7 +1084,7 @@ static void add_preferred_base_object(const char *name, unsigned hash) | |||
| 	cmplen = name_cmp_len(name); | ||||
| 	for (it = pbase_tree; it; it = it->next) { | ||||
| 		if (cmplen == 0) { | ||||
| 			add_object_entry(it->pcache.sha1, 0, 1); | ||||
| 			add_object_entry(it->pcache.sha1, OBJ_TREE, 0, 1); | ||||
| 		} | ||||
| 		else { | ||||
| 			struct tree_desc tree; | ||||
|  | @ -1121,87 +1126,105 @@ static void add_preferred_base(unsigned char *sha1) | |||
|  | ||||
| static void check_object(struct object_entry *entry) | ||||
| { | ||||
| 	if (entry->in_pack && !entry->preferred_base) { | ||||
| 	if (entry->in_pack) { | ||||
| 		struct packed_git *p = entry->in_pack; | ||||
| 		struct pack_window *w_curs = NULL; | ||||
| 		unsigned long size, used; | ||||
| 		const unsigned char *base_ref = NULL; | ||||
| 		struct object_entry *base_entry; | ||||
| 		unsigned long used, used_0; | ||||
| 		unsigned int avail; | ||||
| 		unsigned char *buf; | ||||
| 		struct object_entry *base_entry = NULL; | ||||
| 		off_t ofs; | ||||
| 		unsigned char *buf, c; | ||||
|  | ||||
| 		buf = use_pack(p, &w_curs, entry->in_pack_offset, &avail); | ||||
|  | ||||
| 		/* We want in_pack_type even if we do not reuse delta. | ||||
| 		/* | ||||
| 		 * We want in_pack_type even if we do not reuse delta. | ||||
| 		 * There is no point not reusing non-delta representations. | ||||
| 		 */ | ||||
| 		used = unpack_object_header_gently(buf, avail, | ||||
| 						   &entry->in_pack_type, &size); | ||||
| 						   &entry->in_pack_type, | ||||
| 						   &entry->size); | ||||
|  | ||||
| 		/* Check if it is delta, and the base is also an object | ||||
| 		 * we are going to pack.  If so we will reuse the existing | ||||
| 		 * delta. | ||||
| 		/* | ||||
| 		 * Determine if this is a delta and if so whether we can | ||||
| 		 * reuse it or not.  Otherwise let's find out as cheaply as | ||||
| 		 * possible what the actual type and size for this object is. | ||||
| 		 */ | ||||
| 		if (!no_reuse_delta) { | ||||
| 			unsigned char c; | ||||
| 			const unsigned char *base_name; | ||||
| 			off_t ofs; | ||||
| 			unsigned long used_0; | ||||
| 			/* there is at least 20 bytes left in the pack */ | ||||
| 			switch (entry->in_pack_type) { | ||||
| 			case OBJ_REF_DELTA: | ||||
| 				base_name = use_pack(p, &w_curs, | ||||
| 					entry->in_pack_offset + used, NULL); | ||||
| 				used += 20; | ||||
| 				break; | ||||
| 			case OBJ_OFS_DELTA: | ||||
| 				buf = use_pack(p, &w_curs, | ||||
| 					entry->in_pack_offset + used, NULL); | ||||
| 				used_0 = 0; | ||||
| 				c = buf[used_0++]; | ||||
| 				ofs = c & 127; | ||||
| 				while (c & 128) { | ||||
| 					ofs += 1; | ||||
| 					if (!ofs || MSB(ofs, 7)) | ||||
| 						die("delta base offset overflow in pack for %s", | ||||
| 						    sha1_to_hex(entry->sha1)); | ||||
| 					c = buf[used_0++]; | ||||
| 					ofs = (ofs << 7) + (c & 127); | ||||
| 				} | ||||
| 				if (ofs >= entry->in_pack_offset) | ||||
| 					die("delta base offset out of bound for %s", | ||||
| 					    sha1_to_hex(entry->sha1)); | ||||
| 				ofs = entry->in_pack_offset - ofs; | ||||
| 				base_name = find_packed_object_name(p, ofs); | ||||
| 				used += used_0; | ||||
| 				break; | ||||
| 			default: | ||||
| 				base_name = NULL; | ||||
| 			} | ||||
| 			if (base_name) | ||||
| 				base_entry = locate_object_entry(base_name); | ||||
| 		} | ||||
| 		unuse_pack(&w_curs); | ||||
| 		entry->in_pack_header_size = used; | ||||
|  | ||||
| 		if (base_entry) { | ||||
|  | ||||
| 			/* Depth value does not matter - find_deltas() | ||||
| 			 * will never consider reused delta as the | ||||
| 			 * base object to deltify other objects | ||||
| 			 * against, in order to avoid circular deltas. | ||||
| 			 */ | ||||
|  | ||||
| 			/* uncompressed size of the delta data */ | ||||
| 			entry->size = size; | ||||
| 			entry->delta = base_entry; | ||||
| 		switch (entry->in_pack_type) { | ||||
| 		default: | ||||
| 			/* Not a delta hence we've already got all we need. */ | ||||
| 			entry->type = entry->in_pack_type; | ||||
| 			entry->in_pack_header_size = used; | ||||
| 			unuse_pack(&w_curs); | ||||
| 			return; | ||||
| 		case OBJ_REF_DELTA: | ||||
| 			if (!no_reuse_delta && !entry->preferred_base) | ||||
| 				base_ref = use_pack(p, &w_curs, | ||||
| 						entry->in_pack_offset + used, NULL); | ||||
| 			entry->in_pack_header_size = used + 20; | ||||
| 			break; | ||||
| 		case OBJ_OFS_DELTA: | ||||
| 			buf = use_pack(p, &w_curs, | ||||
| 				       entry->in_pack_offset + used, NULL); | ||||
| 			used_0 = 0; | ||||
| 			c = buf[used_0++]; | ||||
| 			ofs = c & 127; | ||||
| 			while (c & 128) { | ||||
| 				ofs += 1; | ||||
| 				if (!ofs || MSB(ofs, 7)) | ||||
| 					die("delta base offset overflow in pack for %s", | ||||
| 					    sha1_to_hex(entry->sha1)); | ||||
| 				c = buf[used_0++]; | ||||
| 				ofs = (ofs << 7) + (c & 127); | ||||
| 			} | ||||
| 			if (ofs >= entry->in_pack_offset) | ||||
| 				die("delta base offset out of bound for %s", | ||||
| 				    sha1_to_hex(entry->sha1)); | ||||
| 			ofs = entry->in_pack_offset - ofs; | ||||
| 			if (!no_reuse_delta && !entry->preferred_base) | ||||
| 				base_ref = find_packed_object_name(p, ofs); | ||||
| 			entry->in_pack_header_size = used + used_0; | ||||
| 			break; | ||||
| 		} | ||||
|  | ||||
| 		if (base_ref && (base_entry = locate_object_entry(base_ref))) { | ||||
| 			/* | ||||
| 			 * If base_ref was set above that means we wish to | ||||
| 			 * reuse delta data, and we even found that base | ||||
| 			 * in the list of objects we want to pack. Goodie! | ||||
| 			 * | ||||
| 			 * Depth value does not matter - find_deltas() will | ||||
| 			 * never consider reused delta as the base object to | ||||
| 			 * deltify other objects against, in order to avoid | ||||
| 			 * circular deltas. | ||||
| 			 */ | ||||
| 			entry->type = entry->in_pack_type; | ||||
| 			entry->delta = base_entry; | ||||
| 			entry->delta_sibling = base_entry->delta_child; | ||||
| 			base_entry->delta_child = entry; | ||||
|  | ||||
| 			unuse_pack(&w_curs); | ||||
| 			return; | ||||
| 		} | ||||
| 		/* Otherwise we would do the usual */ | ||||
|  | ||||
| 		if (entry->type) { | ||||
| 			/* | ||||
| 			 * This must be a delta and we already know what the | ||||
| 			 * final object type is.  Let's extract the actual | ||||
| 			 * object size from the delta header. | ||||
| 			 */ | ||||
| 			entry->size = get_size_from_delta(p, &w_curs, | ||||
| 					entry->in_pack_offset + entry->in_pack_header_size); | ||||
| 			unuse_pack(&w_curs); | ||||
| 			return; | ||||
| 		} | ||||
|  | ||||
| 		/* | ||||
| 		 * No choice but to fall back to the recursive delta walk | ||||
| 		 * with sha1_object_info() to find about the object type | ||||
| 		 * at this point... | ||||
| 		 */ | ||||
| 		unuse_pack(&w_curs); | ||||
| 	} | ||||
|  | ||||
| 	entry->type = sha1_object_info(entry->sha1, &entry->size); | ||||
|  | @ -1210,14 +1233,37 @@ static void check_object(struct object_entry *entry) | |||
| 		    sha1_to_hex(entry->sha1)); | ||||
| } | ||||
|  | ||||
| static int pack_offset_sort(const void *_a, const void *_b) | ||||
| { | ||||
| 	const struct object_entry *a = *(struct object_entry **)_a; | ||||
| 	const struct object_entry *b = *(struct object_entry **)_b; | ||||
|  | ||||
| 	/* avoid filesystem trashing with loose objects */ | ||||
| 	if (!a->in_pack && !b->in_pack) | ||||
| 		return hashcmp(a->sha1, b->sha1); | ||||
|  | ||||
| 	if (a->in_pack < b->in_pack) | ||||
| 		return -1; | ||||
| 	if (a->in_pack > b->in_pack) | ||||
| 		return 1; | ||||
| 	return a->in_pack_offset < b->in_pack_offset ? -1 : | ||||
| 			(a->in_pack_offset > b->in_pack_offset); | ||||
| } | ||||
|  | ||||
| static void get_object_details(void) | ||||
| { | ||||
| 	uint32_t i; | ||||
| 	struct object_entry *entry; | ||||
| 	struct object_entry **sorted_by_offset; | ||||
|  | ||||
| 	sorted_by_offset = xcalloc(nr_objects, sizeof(struct object_entry *)); | ||||
| 	for (i = 0; i < nr_objects; i++) | ||||
| 		sorted_by_offset[i] = objects + i; | ||||
| 	qsort(sorted_by_offset, nr_objects, sizeof(*sorted_by_offset), pack_offset_sort); | ||||
|  | ||||
| 	prepare_pack_ix(); | ||||
| 	for (i = 0, entry = objects; i < nr_objects; i++, entry++) | ||||
| 		check_object(entry); | ||||
| 	for (i = 0; i < nr_objects; i++) | ||||
| 		check_object(sorted_by_offset[i]); | ||||
| 	free(sorted_by_offset); | ||||
| } | ||||
|  | ||||
| static int type_size_sort(const void *_a, const void *_b) | ||||
|  | @ -1520,20 +1566,20 @@ static void read_object_list_from_stdin(void) | |||
|  | ||||
| 		hash = name_hash(line+41); | ||||
| 		add_preferred_base_object(line+41, hash); | ||||
| 		add_object_entry(sha1, hash, 0); | ||||
| 		add_object_entry(sha1, 0, hash, 0); | ||||
| 	} | ||||
| } | ||||
|  | ||||
| static void show_commit(struct commit *commit) | ||||
| { | ||||
| 	add_object_entry(commit->object.sha1, 0, 0); | ||||
| 	add_object_entry(commit->object.sha1, OBJ_COMMIT, 0, 0); | ||||
| } | ||||
|  | ||||
| static void show_object(struct object_array_entry *p) | ||||
| { | ||||
| 	unsigned hash = name_hash(p->name); | ||||
| 	add_preferred_base_object(p->name, hash); | ||||
| 	add_object_entry(p->item->sha1, hash, 0); | ||||
| 	add_object_entry(p->item->sha1, p->item->type, hash, 0); | ||||
| } | ||||
|  | ||||
| static void show_edge(struct commit *commit) | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue
	
	 Nicolas Pitre
						Nicolas Pitre