View Javadoc
/*
 * Copyright (C) 2008-2009, Google Inc.
 * and other copyright owners as documented in the project's IP log.
 *
 * This program and the accompanying materials are made available
 * under the terms of the Eclipse Distribution License v1.0 which
 * accompanies this distribution, is reproduced below, and is
 * available at http://www.eclipse.org/org/documents/edl-v10.php
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following
 *   disclaimer in the documentation and/or other materials provided
 *   with the distribution.
 *
 * - Neither the name of the Eclipse Foundation, Inc. nor the
 *   names of its contributors may be used to endorse or promote
 *   products derived from this software without specific prior
 *   written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
43  
44  package org.eclipse.jgit.patch;
45  
46  import static org.eclipse.jgit.lib.Constants.encodeASCII;
47  import static org.eclipse.jgit.util.RawParseUtils.decode;
48  import static org.eclipse.jgit.util.RawParseUtils.decodeNoFallback;
49  import static org.eclipse.jgit.util.RawParseUtils.extractBinaryString;
50  import static org.eclipse.jgit.util.RawParseUtils.match;
51  import static org.eclipse.jgit.util.RawParseUtils.nextLF;
52  import static org.eclipse.jgit.util.RawParseUtils.parseBase10;
53  
54  import java.io.IOException;
55  import java.nio.charset.CharacterCodingException;
56  import java.nio.charset.Charset;
57  import java.text.MessageFormat;
58  import java.util.ArrayList;
59  import java.util.Collections;
60  import java.util.List;
61  
62  import org.eclipse.jgit.diff.DiffEntry;
63  import org.eclipse.jgit.diff.EditList;
64  import org.eclipse.jgit.internal.JGitText;
65  import org.eclipse.jgit.lib.AbbreviatedObjectId;
66  import org.eclipse.jgit.lib.Constants;
67  import org.eclipse.jgit.lib.FileMode;
68  import org.eclipse.jgit.util.QuotedString;
69  import org.eclipse.jgit.util.RawParseUtils;
70  import org.eclipse.jgit.util.TemporaryBuffer;
71  
72  /** Patch header describing an action for a single file path. */
73  public class FileHeader extends DiffEntry {
74  	private static final byte[] OLD_MODE = encodeASCII("old mode "); //$NON-NLS-1$
75  
76  	private static final byte[] NEW_MODE = encodeASCII("new mode "); //$NON-NLS-1$
77  
78  	static final byte[] DELETED_FILE_MODE = encodeASCII("deleted file mode "); //$NON-NLS-1$
79  
80  	static final byte[] NEW_FILE_MODE = encodeASCII("new file mode "); //$NON-NLS-1$
81  
82  	private static final byte[] COPY_FROM = encodeASCII("copy from "); //$NON-NLS-1$
83  
84  	private static final byte[] COPY_TO = encodeASCII("copy to "); //$NON-NLS-1$
85  
86  	private static final byte[] RENAME_OLD = encodeASCII("rename old "); //$NON-NLS-1$
87  
88  	private static final byte[] RENAME_NEW = encodeASCII("rename new "); //$NON-NLS-1$
89  
90  	private static final byte[] RENAME_FROM = encodeASCII("rename from "); //$NON-NLS-1$
91  
92  	private static final byte[] RENAME_TO = encodeASCII("rename to "); //$NON-NLS-1$
93  
94  	private static final byte[] SIMILARITY_INDEX = encodeASCII("similarity index "); //$NON-NLS-1$
95  
96  	private static final byte[] DISSIMILARITY_INDEX = encodeASCII("dissimilarity index "); //$NON-NLS-1$
97  
98  	static final byte[] INDEX = encodeASCII("index "); //$NON-NLS-1$
99  
100 	static final byte[] OLD_NAME = encodeASCII("--- "); //$NON-NLS-1$
101 
102 	static final byte[] NEW_NAME = encodeASCII("+++ "); //$NON-NLS-1$
103 
104 	/** Type of patch used by this file. */
105 	public static enum PatchType {
106 		/** A traditional unified diff style patch of a text file. */
107 		UNIFIED,
108 
109 		/** An empty patch with a message "Binary files ... differ" */
110 		BINARY,
111 
112 		/** A Git binary patch, holding pre and post image deltas */
113 		GIT_BINARY;
114 	}
115 
116 	/** Buffer holding the patch data for this file. */
117 	final byte[] buf;
118 
119 	/** Offset within {@link #buf} to the "diff ..." line. */
120 	final int startOffset;
121 
122 	/** Position 1 past the end of this file within {@link #buf}. */
123 	int endOffset;
124 
125 	/** Type of patch used to modify this file */
126 	PatchType patchType;
127 
128 	/** The hunks of this file */
129 	private List<HunkHeader> hunks;
130 
131 	/** If {@link #patchType} is {@link PatchType#GIT_BINARY}, the new image */
132 	BinaryHunk forwardBinaryHunk;
133 
134 	/** If {@link #patchType} is {@link PatchType#GIT_BINARY}, the old image */
135 	BinaryHunk reverseBinaryHunk;
136 
137 	/**
138 	 * Constructs a new FileHeader
139 	 *
140 	 * @param headerLines
141 	 *            buffer holding the diff header for this file
142 	 * @param edits
143 	 *            the edits for this file
144 	 * @param type
145 	 *            the type of patch used to modify this file
146 	 */
147 	public FileHeader(final byte[] headerLines, EditList edits, PatchType type) {
148 		this(headerLines, 0);
149 		endOffset = headerLines.length;
150 		int ptr = parseGitFileName(Patch.DIFF_GIT.length, headerLines.length);
151 		parseGitHeaders(ptr, headerLines.length);
152 		this.patchType = type;
153 		addHunk(new HunkHeader(this, edits));
154 	}
155 
156 	FileHeader(final byte[] b, final int offset) {
157 		buf = b;
158 		startOffset = offset;
159 		changeType = ChangeType.MODIFY; // unless otherwise designated
160 		patchType = PatchType.UNIFIED;
161 	}
162 
163 	int getParentCount() {
164 		return 1;
165 	}
166 
167 	/** @return the byte array holding this file's patch script. */
168 	public byte[] getBuffer() {
169 		return buf;
170 	}
171 
172 	/** @return offset the start of this file's script in {@link #getBuffer()}. */
173 	public int getStartOffset() {
174 		return startOffset;
175 	}
176 
177 	/** @return offset one past the end of the file script. */
178 	public int getEndOffset() {
179 		return endOffset;
180 	}
181 
182 	/**
183 	 * Convert the patch script for this file into a string.
184 	 * <p>
185 	 * The default character encoding ({@link Constants#CHARSET}) is assumed for
186 	 * both the old and new files.
187 	 *
188 	 * @return the patch script, as a Unicode string.
189 	 */
190 	public String getScriptText() {
191 		return getScriptText(null, null);
192 	}
193 
194 	/**
195 	 * Convert the patch script for this file into a string.
196 	 *
197 	 * @param oldCharset
198 	 *            hint character set to decode the old lines with.
199 	 * @param newCharset
200 	 *            hint character set to decode the new lines with.
201 	 * @return the patch script, as a Unicode string.
202 	 */
203 	public String getScriptText(Charset oldCharset, Charset newCharset) {
204 		return getScriptText(new Charset[] { oldCharset, newCharset });
205 	}
206 
207 	String getScriptText(Charset[] charsetGuess) {
208 		if (getHunks().isEmpty()) {
209 			// If we have no hunks then we can safely assume the entire
210 			// patch is a binary style patch, or a meta-data only style
211 			// patch. Either way the encoding of the headers should be
212 			// strictly 7-bit US-ASCII and the body is either 7-bit ASCII
213 			// (due to the base 85 encoding used for a BinaryHunk) or is
214 			// arbitrary noise we have chosen to ignore and not understand
215 			// (e.g. the message "Binary files ... differ").
216 			//
217 			return extractBinaryString(buf, startOffset, endOffset);
218 		}
219 
220 		if (charsetGuess != null && charsetGuess.length != getParentCount() + 1)
221 			throw new IllegalArgumentException(MessageFormat.format(
222 					JGitText.get().expectedCharacterEncodingGuesses,
223 					Integer.valueOf(getParentCount() + 1)));
224 
225 		if (trySimpleConversion(charsetGuess)) {
226 			Charset cs = charsetGuess != null ? charsetGuess[0] : null;
227 			if (cs == null)
228 				cs = Constants.CHARSET;
229 			try {
230 				return decodeNoFallback(cs, buf, startOffset, endOffset);
231 			} catch (CharacterCodingException cee) {
232 				// Try the much slower, more-memory intensive version which
233 				// can handle a character set conversion patch.
234 			}
235 		}
236 
237 		final StringBuilder r = new StringBuilder(endOffset - startOffset);
238 
239 		// Always treat the headers as US-ASCII; Git file names are encoded
240 		// in a C style escape if any character has the high-bit set.
241 		//
242 		final int hdrEnd = getHunks().get(0).getStartOffset();
243 		for (int ptr = startOffset; ptr < hdrEnd;) {
244 			final int eol = Math.min(hdrEnd, nextLF(buf, ptr));
245 			r.append(extractBinaryString(buf, ptr, eol));
246 			ptr = eol;
247 		}
248 
249 		final String[] files = extractFileLines(charsetGuess);
250 		final int[] offsets = new int[files.length];
251 		for (final HunkHeader h : getHunks())
252 			h.extractFileLines(r, files, offsets);
253 		return r.toString();
254 	}
255 
256 	private static boolean trySimpleConversion(final Charset[] charsetGuess) {
257 		if (charsetGuess == null)
258 			return true;
259 		for (int i = 1; i < charsetGuess.length; i++) {
260 			if (charsetGuess[i] != charsetGuess[0])
261 				return false;
262 		}
263 		return true;
264 	}
265 
266 	private String[] extractFileLines(final Charset[] csGuess) {
267 		final TemporaryBuffer[] tmp = new TemporaryBuffer[getParentCount() + 1];
268 		try {
269 			for (int i = 0; i < tmp.length; i++)
270 				tmp[i] = new TemporaryBuffer.Heap(Integer.MAX_VALUE);
271 			for (final HunkHeader h : getHunks())
272 				h.extractFileLines(tmp);
273 
274 			final String[] r = new String[tmp.length];
275 			for (int i = 0; i < tmp.length; i++) {
276 				Charset cs = csGuess != null ? csGuess[i] : null;
277 				if (cs == null)
278 					cs = Constants.CHARSET;
279 				r[i] = RawParseUtils.decode(cs, tmp[i].toByteArray());
280 			}
281 			return r;
282 		} catch (IOException ioe) {
283 			throw new RuntimeException(JGitText.get().cannotConvertScriptToText, ioe);
284 		}
285 	}
286 
287 	/** @return style of patch used to modify this file */
288 	public PatchType getPatchType() {
289 		return patchType;
290 	}
291 
292 	/** @return true if this patch modifies metadata about a file */
293 	public boolean hasMetaDataChanges() {
294 		return changeType != ChangeType.MODIFY || newMode != oldMode;
295 	}
296 
297 	/** @return hunks altering this file; in order of appearance in patch */
298 	public List<? extends HunkHeader> getHunks() {
299 		if (hunks == null)
300 			return Collections.emptyList();
301 		return hunks;
302 	}
303 
304 	void addHunk(final HunkHeader h) {
305 		if (h.getFileHeader() != this)
306 			throw new IllegalArgumentException(JGitText.get().hunkBelongsToAnotherFile);
307 		if (hunks == null)
308 			hunks = new ArrayList<HunkHeader>();
309 		hunks.add(h);
310 	}
311 
312 	HunkHeader newHunkHeader(final int offset) {
313 		return new HunkHeader(this, offset);
314 	}
315 
316 	/** @return if a {@link PatchType#GIT_BINARY}, the new-image delta/literal */
317 	public BinaryHunk getForwardBinaryHunk() {
318 		return forwardBinaryHunk;
319 	}
320 
321 	/** @return if a {@link PatchType#GIT_BINARY}, the old-image delta/literal */
322 	public BinaryHunk getReverseBinaryHunk() {
323 		return reverseBinaryHunk;
324 	}
325 
326 	/** @return a list describing the content edits performed on this file. */
327 	public EditList toEditList() {
328 		final EditList r = new EditList();
329 		for (final HunkHeader hunk : hunks)
330 			r.addAll(hunk.toEditList());
331 		return r;
332 	}
333 
334 	/**
335 	 * Parse a "diff --git" or "diff --cc" line.
336 	 *
337 	 * @param ptr
338 	 *            first character after the "diff --git " or "diff --cc " part.
339 	 * @param end
340 	 *            one past the last position to parse.
341 	 * @return first character after the LF at the end of the line; -1 on error.
342 	 */
343 	int parseGitFileName(int ptr, final int end) {
344 		final int eol = nextLF(buf, ptr);
345 		final int bol = ptr;
346 		if (eol >= end) {
347 			return -1;
348 		}
349 
350 		// buffer[ptr..eol] looks like "a/foo b/foo\n". After the first
351 		// A regex to match this is "^[^/]+/(.*?) [^/+]+/\1\n$". There
352 		// is only one way to split the line such that text to the left
353 		// of the space matches the text to the right, excluding the part
354 		// before the first slash.
355 		//
356 
357 		final int aStart = nextLF(buf, ptr, '/');
358 		if (aStart >= eol)
359 			return eol;
360 
361 		while (ptr < eol) {
362 			final int sp = nextLF(buf, ptr, ' ');
363 			if (sp >= eol) {
364 				// We can't split the header, it isn't valid.
365 				// This may be OK if this is a rename patch.
366 				//
367 				return eol;
368 			}
369 			final int bStart = nextLF(buf, sp, '/');
370 			if (bStart >= eol)
371 				return eol;
372 
373 			// If buffer[aStart..sp - 1] = buffer[bStart..eol - 1]
374 			// we have a valid split.
375 			//
376 			if (eq(aStart, sp - 1, bStart, eol - 1)) {
377 				if (buf[bol] == '"') {
378 					// We're a double quoted name. The region better end
379 					// in a double quote too, and we need to decode the
380 					// characters before reading the name.
381 					//
382 					if (buf[sp - 2] != '"') {
383 						return eol;
384 					}
385 					oldPath = QuotedString.GIT_PATH.dequote(buf, bol, sp - 1);
386 					oldPath = p1(oldPath);
387 				} else {
388 					oldPath = decode(Constants.CHARSET, buf, aStart, sp - 1);
389 				}
390 				newPath = oldPath;
391 				return eol;
392 			}
393 
394 			// This split wasn't correct. Move past the space and try
395 			// another split as the space must be part of the file name.
396 			//
397 			ptr = sp;
398 		}
399 
400 		return eol;
401 	}
402 
403 	int parseGitHeaders(int ptr, final int end) {
404 		while (ptr < end) {
405 			final int eol = nextLF(buf, ptr);
406 			if (isHunkHdr(buf, ptr, eol) >= 1) {
407 				// First hunk header; break out and parse them later.
408 				break;
409 
410 			} else if (match(buf, ptr, OLD_NAME) >= 0) {
411 				parseOldName(ptr, eol);
412 
413 			} else if (match(buf, ptr, NEW_NAME) >= 0) {
414 				parseNewName(ptr, eol);
415 
416 			} else if (match(buf, ptr, OLD_MODE) >= 0) {
417 				oldMode = parseFileMode(ptr + OLD_MODE.length, eol);
418 
419 			} else if (match(buf, ptr, NEW_MODE) >= 0) {
420 				newMode = parseFileMode(ptr + NEW_MODE.length, eol);
421 
422 			} else if (match(buf, ptr, DELETED_FILE_MODE) >= 0) {
423 				oldMode = parseFileMode(ptr + DELETED_FILE_MODE.length, eol);
424 				newMode = FileMode.MISSING;
425 				changeType = ChangeType.DELETE;
426 
427 			} else if (match(buf, ptr, NEW_FILE_MODE) >= 0) {
428 				parseNewFileMode(ptr, eol);
429 
430 			} else if (match(buf, ptr, COPY_FROM) >= 0) {
431 				oldPath = parseName(oldPath, ptr + COPY_FROM.length, eol);
432 				changeType = ChangeType.COPY;
433 
434 			} else if (match(buf, ptr, COPY_TO) >= 0) {
435 				newPath = parseName(newPath, ptr + COPY_TO.length, eol);
436 				changeType = ChangeType.COPY;
437 
438 			} else if (match(buf, ptr, RENAME_OLD) >= 0) {
439 				oldPath = parseName(oldPath, ptr + RENAME_OLD.length, eol);
440 				changeType = ChangeType.RENAME;
441 
442 			} else if (match(buf, ptr, RENAME_NEW) >= 0) {
443 				newPath = parseName(newPath, ptr + RENAME_NEW.length, eol);
444 				changeType = ChangeType.RENAME;
445 
446 			} else if (match(buf, ptr, RENAME_FROM) >= 0) {
447 				oldPath = parseName(oldPath, ptr + RENAME_FROM.length, eol);
448 				changeType = ChangeType.RENAME;
449 
450 			} else if (match(buf, ptr, RENAME_TO) >= 0) {
451 				newPath = parseName(newPath, ptr + RENAME_TO.length, eol);
452 				changeType = ChangeType.RENAME;
453 
454 			} else if (match(buf, ptr, SIMILARITY_INDEX) >= 0) {
455 				score = parseBase10(buf, ptr + SIMILARITY_INDEX.length, null);
456 
457 			} else if (match(buf, ptr, DISSIMILARITY_INDEX) >= 0) {
458 				score = parseBase10(buf, ptr + DISSIMILARITY_INDEX.length, null);
459 
460 			} else if (match(buf, ptr, INDEX) >= 0) {
461 				parseIndexLine(ptr + INDEX.length, eol);
462 
463 			} else {
464 				// Probably an empty patch (stat dirty).
465 				break;
466 			}
467 
468 			ptr = eol;
469 		}
470 		return ptr;
471 	}
472 
473 	void parseOldName(int ptr, final int eol) {
474 		oldPath = p1(parseName(oldPath, ptr + OLD_NAME.length, eol));
475 		if (oldPath == DEV_NULL)
476 			changeType = ChangeType.ADD;
477 	}
478 
479 	void parseNewName(int ptr, final int eol) {
480 		newPath = p1(parseName(newPath, ptr + NEW_NAME.length, eol));
481 		if (newPath == DEV_NULL)
482 			changeType = ChangeType.DELETE;
483 	}
484 
485 	void parseNewFileMode(int ptr, final int eol) {
486 		oldMode = FileMode.MISSING;
487 		newMode = parseFileMode(ptr + NEW_FILE_MODE.length, eol);
488 		changeType = ChangeType.ADD;
489 	}
490 
491 	int parseTraditionalHeaders(int ptr, final int end) {
492 		while (ptr < end) {
493 			final int eol = nextLF(buf, ptr);
494 			if (isHunkHdr(buf, ptr, eol) >= 1) {
495 				// First hunk header; break out and parse them later.
496 				break;
497 
498 			} else if (match(buf, ptr, OLD_NAME) >= 0) {
499 				parseOldName(ptr, eol);
500 
501 			} else if (match(buf, ptr, NEW_NAME) >= 0) {
502 				parseNewName(ptr, eol);
503 
504 			} else {
505 				// Possibly an empty patch.
506 				break;
507 			}
508 
509 			ptr = eol;
510 		}
511 		return ptr;
512 	}
513 
514 	private String parseName(final String expect, int ptr, final int end) {
515 		if (ptr == end)
516 			return expect;
517 
518 		String r;
519 		if (buf[ptr] == '"') {
520 			// New style GNU diff format
521 			//
522 			r = QuotedString.GIT_PATH.dequote(buf, ptr, end - 1);
523 		} else {
524 			// Older style GNU diff format, an optional tab ends the name.
525 			//
526 			int tab = end;
527 			while (ptr < tab && buf[tab - 1] != '\t')
528 				tab--;
529 			if (ptr == tab)
530 				tab = end;
531 			r = decode(Constants.CHARSET, buf, ptr, tab - 1);
532 		}
533 
534 		if (r.equals(DEV_NULL))
535 			r = DEV_NULL;
536 		return r;
537 	}
538 
539 	private static String p1(final String r) {
540 		final int s = r.indexOf('/');
541 		return s > 0 ? r.substring(s + 1) : r;
542 	}
543 
544 	FileMode parseFileMode(int ptr, final int end) {
545 		int tmp = 0;
546 		while (ptr < end - 1) {
547 			tmp <<= 3;
548 			tmp += buf[ptr++] - '0';
549 		}
550 		return FileMode.fromBits(tmp);
551 	}
552 
553 	void parseIndexLine(int ptr, final int end) {
554 		// "index $asha1..$bsha1[ $mode]" where $asha1 and $bsha1
555 		// can be unique abbreviations
556 		//
557 		final int dot2 = nextLF(buf, ptr, '.');
558 		final int mode = nextLF(buf, dot2, ' ');
559 
560 		oldId = AbbreviatedObjectId.fromString(buf, ptr, dot2 - 1);
561 		newId = AbbreviatedObjectId.fromString(buf, dot2 + 1, mode - 1);
562 
563 		if (mode < end)
564 			newMode = oldMode = parseFileMode(mode, end);
565 	}
566 
567 	private boolean eq(int aPtr, int aEnd, int bPtr, int bEnd) {
568 		if (aEnd - aPtr != bEnd - bPtr) {
569 			return false;
570 		}
571 		while (aPtr < aEnd) {
572 			if (buf[aPtr++] != buf[bPtr++])
573 				return false;
574 		}
575 		return true;
576 	}
577 
578 	/**
579 	 * Determine if this is a patch hunk header.
580 	 *
581 	 * @param buf
582 	 *            the buffer to scan
583 	 * @param start
584 	 *            first position in the buffer to evaluate
585 	 * @param end
586 	 *            last position to consider; usually the end of the buffer (
587 	 *            <code>buf.length</code>) or the first position on the next
588 	 *            line. This is only used to avoid very long runs of '@' from
589 	 *            killing the scan loop.
590 	 * @return the number of "ancestor revisions" in the hunk header. A
591 	 *         traditional two-way diff ("@@ -...") returns 1; a combined diff
592 	 *         for a 3 way-merge returns 3. If this is not a hunk header, 0 is
593 	 *         returned instead.
594 	 */
595 	static int isHunkHdr(final byte[] buf, final int start, final int end) {
596 		int ptr = start;
597 		while (ptr < end && buf[ptr] == '@')
598 			ptr++;
599 		if (ptr - start < 2)
600 			return 0;
601 		if (ptr == end || buf[ptr++] != ' ')
602 			return 0;
603 		if (ptr == end || buf[ptr++] != '-')
604 			return 0;
605 		return (ptr - 3) - start;
606 	}
607 }