1 /* 2 * Copyright (C) 2010, Google Inc. and others 3 * 4 * This program and the accompanying materials are made available under the 5 * terms of the Eclipse Distribution License v. 1.0 which is available at 6 * https://www.eclipse.org/org/documents/edl-v10.php. 7 * 8 * SPDX-License-Identifier: BSD-3-Clause 9 */ 10 11 package org.eclipse.jgit.diff; 12 13 import java.util.ArrayList; 14 import java.util.List; 15 16 /** 17 * An extended form of Bram Cohen's patience diff algorithm. 18 * <p> 19 * This implementation was derived by using the 4 rules that are outlined in 20 * Bram Cohen's <a href="http://bramcohen.livejournal.com/73318.html">blog</a>, 21 * and then was further extended to support low-occurrence common elements. 22 * <p> 23 * The basic idea of the algorithm is to create a histogram of occurrences for 24 * each element of sequence A. Each element of sequence B is then considered in 25 * turn. If the element also exists in sequence A, and has a lower occurrence 26 * count, the positions are considered as a candidate for the longest common 27 * subsequence (LCS). After scanning of B is complete the LCS that has the 28 * lowest number of occurrences is chosen as a split point. The region is split 29 * around the LCS, and the algorithm is recursively applied to the sections 30 * before and after the LCS. 31 * <p> 32 * By always selecting a LCS position with the lowest occurrence count, this 33 * algorithm behaves exactly like Bram Cohen's patience diff whenever there is a 34 * unique common element available between the two sequences. When no unique 35 * elements exist, the lowest occurrence element is chosen instead. This offers 36 * more readable diffs than simply falling back on the standard Myers' O(ND) 37 * algorithm would produce. 38 * <p> 39 * To prevent the algorithm from having an O(N^2) running time, an upper limit 40 * on the number of unique elements in a histogram bucket is configured by 41 * {@link #setMaxChainLength(int)}. If sequence A has more than this many 42 * elements that hash into the same hash bucket, the algorithm passes the region 43 * to {@link #setFallbackAlgorithm(DiffAlgorithm)}. If no fallback algorithm is 44 * configured, the region is emitted as a replace edit. 45 * <p> 46 * During scanning of sequence B, any element of A that occurs more than 47 * {@link #setMaxChainLength(int)} times is never considered for an LCS match 48 * position, even if it is common between the two sequences. This limits the 49 * number of locations in sequence A that must be considered to find the LCS, 50 * and helps maintain a lower running time bound. 51 * <p> 52 * So long as {@link #setMaxChainLength(int)} is a small constant (such as 64), 53 * the algorithm runs in O(N * D) time, where N is the sum of the input lengths 54 * and D is the number of edits in the resulting EditList. If the supplied 55 * {@link org.eclipse.jgit.diff.SequenceComparator} has a good hash function, 56 * this implementation typically out-performs 57 * {@link org.eclipse.jgit.diff.MyersDiff}, even though its theoretical running 58 * time is the same. 59 * <p> 60 * This implementation has an internal limitation that prevents it from handling 61 * sequences with more than 268,435,456 (2^28) elements. 62 */ 63 public class HistogramDiff extends LowLevelDiffAlgorithm { 64 /** Algorithm to use when there are too many element occurrences. */ 65 DiffAlgorithm fallback = MyersDiff.INSTANCE; 66 67 /** 68 * Maximum number of positions to consider for a given element hash. 69 * 70 * All elements with the same hash are stored into a single chain. The chain 71 * size is capped to ensure search is linear time at O(len_A + len_B) rather 72 * than quadratic at O(len_A * len_B). 73 */ 74 int maxChainLength = 64; 75 76 /** 77 * Set the algorithm used when there are too many element occurrences. 78 * 79 * @param alg 80 * the secondary algorithm. If null the region will be denoted as 81 * a single REPLACE block. 82 */ 83 public void setFallbackAlgorithm(DiffAlgorithm alg) { 84 fallback = alg; 85 } 86 87 /** 88 * Maximum number of positions to consider for a given element hash. 89 * 90 * All elements with the same hash are stored into a single chain. The chain 91 * size is capped to ensure search is linear time at O(len_A + len_B) rather 92 * than quadratic at O(len_A * len_B). 93 * 94 * @param maxLen 95 * new maximum length. 96 */ 97 public void setMaxChainLength(int maxLen) { 98 maxChainLength = maxLen; 99 } 100 101 /** {@inheritDoc} */ 102 @Override 103 public <S extends Sequence> void diffNonCommon(EditList edits, 104 HashedSequenceComparator<S> cmp, HashedSequence<S> a, 105 HashedSequence<S> b, Edit region) { 106 new State<>(edits, cmp, a, b).diffRegion(region); 107 } 108 109 private class State<S extends Sequence> { 110 private final HashedSequenceComparator<S> cmp; 111 private final HashedSequence<S> a; 112 private final HashedSequence<S> b; 113 private final List<Edit> queue = new ArrayList<>(); 114 115 /** Result edits we have determined that must be made to convert a to b. */ 116 final EditList edits; 117 118 State(EditList edits, HashedSequenceComparator<S> cmp, 119 HashedSequence<S> a, HashedSequence<S> b) { 120 this.cmp = cmp; 121 this.a = a; 122 this.b = b; 123 this.edits = edits; 124 } 125 126 void diffRegion(Edit r) { 127 diffReplace(r); 128 while (!queue.isEmpty()) 129 diff(queue.remove(queue.size() - 1)); 130 } 131 132 private void diffReplace(Edit r) { 133 Edit lcs = new HistogramDiffIndex<>(maxChainLength, cmp, a, b, r) 134 .findLongestCommonSequence(); 135 if (lcs != null) { 136 // If we were given an edit, we can prove a result here. 137 // 138 if (lcs.isEmpty()) { 139 // An empty edit indicates there is nothing in common. 140 // Replace the entire region. 141 // 142 edits.add(r); 143 } else { 144 queue.add(r.after(lcs)); 145 queue.add(r.before(lcs)); 146 } 147 148 } else if (fallback instanceof LowLevelDiffAlgorithm) { 149 LowLevelDiffAlgorithm fb = (LowLevelDiffAlgorithm) fallback; 150 fb.diffNonCommon(edits, cmp, a, b, r); 151 152 } else if (fallback != null) { 153 SubsequenceComparator<HashedSequence<S>> cs = subcmp(); 154 Subsequence<HashedSequence<S>> as = Subsequence.a(a, r); 155 Subsequence<HashedSequence<S>> bs = Subsequence.b(b, r); 156 157 EditList res = fallback.diffNonCommon(cs, as, bs); 158 edits.addAll(Subsequence.toBase(res, as, bs)); 159 160 } else { 161 edits.add(r); 162 } 163 } 164 165 private void diff(Edit r) { 166 switch (r.getType()) { 167 case INSERT: 168 case DELETE: 169 edits.add(r); 170 break; 171 172 case REPLACE: 173 if (r.getLengthA() == 1 && r.getLengthB() == 1) 174 edits.add(r); 175 else 176 diffReplace(r); 177 break; 178 179 case EMPTY: 180 default: 181 throw new IllegalStateException(); 182 } 183 } 184 185 private SubsequenceComparator<HashedSequence<S>> subcmp() { 186 return new SubsequenceComparator<>(cmp); 187 } 188 } 189 }