View Javadoc
1   /*
2    * Copyright (C) 2010, Google Inc. and others
3    *
4    * This program and the accompanying materials are made available under the
5    * terms of the Eclipse Distribution License v. 1.0 which is available at
6    * https://www.eclipse.org/org/documents/edl-v10.php.
7    *
8    * SPDX-License-Identifier: BSD-3-Clause
9    */
10  
11  package org.eclipse.jgit.diff;
12  
13  import java.util.ArrayList;
14  import java.util.List;
15  
16  /**
17   * An extended form of Bram Cohen's patience diff algorithm.
18   * <p>
19   * This implementation was derived by using the 4 rules that are outlined in
20   * Bram Cohen's <a href="http://bramcohen.livejournal.com/73318.html">blog</a>,
21   * and then was further extended to support low-occurrence common elements.
22   * <p>
23   * The basic idea of the algorithm is to create a histogram of occurrences for
24   * each element of sequence A. Each element of sequence B is then considered in
25   * turn. If the element also exists in sequence A, and has a lower occurrence
26   * count, the positions are considered as a candidate for the longest common
27   * subsequence (LCS). After scanning of B is complete the LCS that has the
28   * lowest number of occurrences is chosen as a split point. The region is split
29   * around the LCS, and the algorithm is recursively applied to the sections
30   * before and after the LCS.
31   * <p>
32   * By always selecting a LCS position with the lowest occurrence count, this
33   * algorithm behaves exactly like Bram Cohen's patience diff whenever there is a
34   * unique common element available between the two sequences. When no unique
35   * elements exist, the lowest occurrence element is chosen instead. This offers
36   * more readable diffs than simply falling back on the standard Myers' O(ND)
37   * algorithm would produce.
38   * <p>
39   * To prevent the algorithm from having an O(N^2) running time, an upper limit
40   * on the number of unique elements in a histogram bucket is configured by
41   * {@link #setMaxChainLength(int)}. If sequence A has more than this many
42   * elements that hash into the same hash bucket, the algorithm passes the region
43   * to {@link #setFallbackAlgorithm(DiffAlgorithm)}. If no fallback algorithm is
44   * configured, the region is emitted as a replace edit.
45   * <p>
46   * During scanning of sequence B, any element of A that occurs more than
47   * {@link #setMaxChainLength(int)} times is never considered for an LCS match
48   * position, even if it is common between the two sequences. This limits the
49   * number of locations in sequence A that must be considered to find the LCS,
50   * and helps maintain a lower running time bound.
51   * <p>
52   * So long as {@link #setMaxChainLength(int)} is a small constant (such as 64),
53   * the algorithm runs in O(N * D) time, where N is the sum of the input lengths
54   * and D is the number of edits in the resulting EditList. If the supplied
55   * {@link org.eclipse.jgit.diff.SequenceComparator} has a good hash function,
56   * this implementation typically out-performs
57   * {@link org.eclipse.jgit.diff.MyersDiff}, even though its theoretical running
58   * time is the same.
59   * <p>
60   * This implementation has an internal limitation that prevents it from handling
61   * sequences with more than 268,435,456 (2^28) elements.
62   */
63  public class HistogramDiff extends LowLevelDiffAlgorithm {
64  	/** Algorithm to use when there are too many element occurrences. */
65  	DiffAlgorithm fallback = MyersDiff.INSTANCE;
66  
67  	/**
68  	 * Maximum number of positions to consider for a given element hash.
69  	 *
70  	 * All elements with the same hash are stored into a single chain. The chain
71  	 * size is capped to ensure search is linear time at O(len_A + len_B) rather
72  	 * than quadratic at O(len_A * len_B).
73  	 */
74  	int maxChainLength = 64;
75  
76  	/**
77  	 * Set the algorithm used when there are too many element occurrences.
78  	 *
79  	 * @param alg
80  	 *            the secondary algorithm. If null the region will be denoted as
81  	 *            a single REPLACE block.
82  	 */
83  	public void setFallbackAlgorithm(DiffAlgorithm alg) {
84  		fallback = alg;
85  	}
86  
87  	/**
88  	 * Maximum number of positions to consider for a given element hash.
89  	 *
90  	 * All elements with the same hash are stored into a single chain. The chain
91  	 * size is capped to ensure search is linear time at O(len_A + len_B) rather
92  	 * than quadratic at O(len_A * len_B).
93  	 *
94  	 * @param maxLen
95  	 *            new maximum length.
96  	 */
97  	public void setMaxChainLength(int maxLen) {
98  		maxChainLength = maxLen;
99  	}
100 
101 	/** {@inheritDoc} */
102 	@Override
103 	public <S extends Sequence> void diffNonCommon(EditList edits,
104 			HashedSequenceComparator<S> cmp, HashedSequence<S> a,
105 			HashedSequence<S> b, Edit region) {
106 		new State<>(edits, cmp, a, b).diffRegion(region);
107 	}
108 
109 	private class State<S extends Sequence> {
110 		private final HashedSequenceComparator<S> cmp;
111 		private final HashedSequence<S> a;
112 		private final HashedSequence<S> b;
113 		private final List<Edit> queue = new ArrayList<>();
114 
115 		/** Result edits we have determined that must be made to convert a to b. */
116 		final EditList edits;
117 
118 		State(EditList edits, HashedSequenceComparator<S> cmp,
119 				HashedSequence<S> a, HashedSequence<S> b) {
120 			this.cmp = cmp;
121 			this.a = a;
122 			this.b = b;
123 			this.edits = edits;
124 		}
125 
126 		void diffRegion(Edit r) {
127 			diffReplace(r);
128 			while (!queue.isEmpty())
129 				diff(queue.remove(queue.size() - 1));
130 		}
131 
132 		private void diffReplace(Edit r) {
133 			Edit lcs = new HistogramDiffIndex<>(maxChainLength, cmp, a, b, r)
134 					.findLongestCommonSequence();
135 			if (lcs != null) {
136 				// If we were given an edit, we can prove a result here.
137 				//
138 				if (lcs.isEmpty()) {
139 					// An empty edit indicates there is nothing in common.
140 					// Replace the entire region.
141 					//
142 					edits.add(r);
143 				} else {
144 					queue.add(r.after(lcs));
145 					queue.add(r.before(lcs));
146 				}
147 
148 			} else if (fallback instanceof LowLevelDiffAlgorithm) {
149 				LowLevelDiffAlgorithm fb = (LowLevelDiffAlgorithm) fallback;
150 				fb.diffNonCommon(edits, cmp, a, b, r);
151 
152 			} else if (fallback != null) {
153 				SubsequenceComparator<HashedSequence<S>> cs = subcmp();
154 				Subsequence<HashedSequence<S>> as = Subsequence.a(a, r);
155 				Subsequence<HashedSequence<S>> bs = Subsequence.b(b, r);
156 
157 				EditList res = fallback.diffNonCommon(cs, as, bs);
158 				edits.addAll(Subsequence.toBase(res, as, bs));
159 
160 			} else {
161 				edits.add(r);
162 			}
163 		}
164 
165 		private void diff(Edit r) {
166 			switch (r.getType()) {
167 			case INSERT:
168 			case DELETE:
169 				edits.add(r);
170 				break;
171 
172 			case REPLACE:
173 				if (r.getLengthA() == 1 && r.getLengthB() == 1)
174 					edits.add(r);
175 				else
176 					diffReplace(r);
177 				break;
178 
179 			case EMPTY:
180 			default:
181 				throw new IllegalStateException();
182 			}
183 		}
184 
185 		private SubsequenceComparator<HashedSequence<S>> subcmp() {
186 			return new SubsequenceComparator<>(cmp);
187 		}
188 	}
189 }