/*
 * Copyright (C) 2009, Google Inc.
 * Copyright (C) 2008-2021, Johannes E. Schindelin <johannes.schindelin@gmx.de> and others
 *
 * This program and the accompanying materials are made available under the
 * terms of the Eclipse Distribution License v. 1.0 which is available at
 * https://www.eclipse.org/org/documents/edl-v10.php.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */
11  
12  package org.eclipse.jgit.diff;
13  
14  import java.io.EOFException;
15  import java.io.File;
16  import java.io.IOException;
17  import java.io.InputStream;
18  import java.io.OutputStream;
19  import java.nio.ByteBuffer;
20  import java.util.concurrent.atomic.AtomicInteger;
21  
22  import org.eclipse.jgit.errors.BinaryBlobException;
23  import org.eclipse.jgit.errors.LargeObjectException;
24  import org.eclipse.jgit.lib.ObjectLoader;
25  import org.eclipse.jgit.util.IO;
26  import org.eclipse.jgit.util.IntList;
27  import org.eclipse.jgit.util.RawParseUtils;
28  
29  /**
30   * A Sequence supporting UNIX formatted text in byte[] format.
31   * <p>
32   * Elements of the sequence are the lines of the file, as delimited by the UNIX
33   * newline character ('\n'). The file content is treated as 8 bit binary text,
34   * with no assumptions or requirements on character encoding.
35   * <p>
36   * Note that the first line of the file is element 0, as defined by the Sequence
37   * interface API. Traditionally in a text editor a patch file the first line is
38   * line number 1. Callers may need to subtract 1 prior to invoking methods if
39   * they are converting from "line number" to "element index".
40   */
41  public class RawText extends Sequence {
42  
43  	/** A RawText of length 0 */
44  	public static final RawText EMPTY_TEXT = new RawText(new byte[0]);
45  
46  	/**
47  	 * Default and minimum for {@link #BUFFER_SIZE}.
48  	 */
49  	private static final int FIRST_FEW_BYTES = 8 * 1024;
50  
51  	/**
52  	 * Number of bytes to check for heuristics in {@link #isBinary(byte[])}.
53  	 */
54  	private static final AtomicInteger BUFFER_SIZE = new AtomicInteger(
55  			FIRST_FEW_BYTES);
56  
57  	/** The file content for this sequence. */
58  	protected final byte[] content;
59  
60  	/** Map of line number to starting position within {@link #content}. */
61  	protected final IntList lines;
62  
63  	/**
64  	 * Create a new sequence from an existing content byte array.
65  	 * <p>
66  	 * The entire array (indexes 0 through length-1) is used as the content.
67  	 *
68  	 * @param input
69  	 *            the content array. The object retains a reference to this
70  	 *            array, so it should be immutable.
71  	 */
72  	public RawText(byte[] input) {
73  		this(input, RawParseUtils.lineMap(input, 0, input.length));
74  	}
75  
76  	/**
77  	 * Create a new sequence from the existing content byte array and the line
78  	 * map indicating line boundaries.
79  	 *
80  	 * @param input
81  	 *            the content array. The object retains a reference to this
82  	 *            array, so it should be immutable.
83  	 * @param lineMap
84  	 *            an array with 1-based offsets for the start of each line.
85  	 *            The first and last entries should be {@link Integer#MIN_VALUE}
86  	 *            and an offset one past the end of the last line, respectively.
87  	 * @since 5.0
88  	 */
89  	public RawText(byte[] input, IntList lineMap) {
90  		content = input;
91  		lines = lineMap;
92  	}
93  
94  	/**
95  	 * Create a new sequence from a file.
96  	 * <p>
97  	 * The entire file contents are used.
98  	 *
99  	 * @param file
100 	 *            the text file.
101 	 * @throws java.io.IOException
102 	 *             if Exceptions occur while reading the file
103 	 */
104 	public RawText(File file) throws IOException {
105 		this(IO.readFully(file));
106 	}
107 
108 	/**
109 	 * @return the raw, unprocessed content read.
110 	 * @since 4.11
111 	 */
112 	public byte[] getRawContent() {
113 		return content;
114 	}
115 
116 	/** @return total number of items in the sequence. */
117 	/** {@inheritDoc} */
118 	@Override
119 	public int size() {
120 		// The line map is always 2 entries larger than the number of lines in
121 		// the file. Index 0 is padded out/unused. The last index is the total
122 		// length of the buffer, and acts as a sentinel.
123 		//
124 		return lines.size() - 2;
125 	}
126 
127 	/**
128 	 * Write a specific line to the output stream, without its trailing LF.
129 	 * <p>
130 	 * The specified line is copied as-is, with no character encoding
131 	 * translation performed.
132 	 * <p>
133 	 * If the specified line ends with an LF ('\n'), the LF is <b>not</b>
134 	 * copied. It is up to the caller to write the LF, if desired, between
135 	 * output lines.
136 	 *
137 	 * @param out
138 	 *            stream to copy the line data onto.
139 	 * @param i
140 	 *            index of the line to extract. Note this is 0-based, so line
141 	 *            number 1 is actually index 0.
142 	 * @throws java.io.IOException
143 	 *             the stream write operation failed.
144 	 */
145 	public void writeLine(OutputStream out, int i)
146 			throws IOException {
147 		int start = getStart(i);
148 		int end = getEnd(i);
149 		if (content[end - 1] == '\n')
150 			end--;
151 		out.write(content, start, end - start);
152 	}
153 
154 	/**
155 	 * Determine if the file ends with a LF ('\n').
156 	 *
157 	 * @return true if the last line has an LF; false otherwise.
158 	 */
159 	public boolean isMissingNewlineAtEnd() {
160 		final int end = lines.get(lines.size() - 1);
161 		if (end == 0)
162 			return true;
163 		return content[end - 1] != '\n';
164 	}
165 
166 	/**
167 	 * Get the text for a single line.
168 	 *
169 	 * @param i
170 	 *            index of the line to extract. Note this is 0-based, so line
171 	 *            number 1 is actually index 0.
172 	 * @return the text for the line, without a trailing LF.
173 	 */
174 	public String getString(int i) {
175 		return getString(i, i + 1, true);
176 	}
177 
178 	/**
179 	 * Get the raw text for a single line.
180 	 *
181 	 * @param i
182 	 *            index of the line to extract. Note this is 0-based, so line
183 	 *            number 1 is actually index 0.
184 	 * @return the text for the line, without a trailing LF, as a
185 	 *         {@link ByteBuffer} that is backed by a slice of the
186 	 *         {@link #getRawContent() raw content}, with the buffer's position
187 	 *         on the start of the line and the limit at the end.
188 	 * @since 5.12
189 	 */
190 	public ByteBuffer getRawString(int i) {
191 		int s = getStart(i);
192 		int e = getEnd(i);
193 		if (e > 0 && content[e - 1] == '\n') {
194 			e--;
195 		}
196 		return ByteBuffer.wrap(content, s, e - s);
197 	}
198 
199 	/**
200 	 * Get the text for a region of lines.
201 	 *
202 	 * @param begin
203 	 *            index of the first line to extract. Note this is 0-based, so
204 	 *            line number 1 is actually index 0.
205 	 * @param end
206 	 *            index of one past the last line to extract.
207 	 * @param dropLF
208 	 *            if true the trailing LF ('\n') of the last returned line is
209 	 *            dropped, if present.
210 	 * @return the text for lines {@code [begin, end)}.
211 	 */
212 	public String getString(int begin, int end, boolean dropLF) {
213 		if (begin == end)
214 			return ""; //$NON-NLS-1$
215 
216 		int s = getStart(begin);
217 		int e = getEnd(end - 1);
218 		if (dropLF && content[e - 1] == '\n')
219 			e--;
220 		return decode(s, e);
221 	}
222 
223 	/**
224 	 * Decode a region of the text into a String.
225 	 *
226 	 * The default implementation of this method tries to guess the character
227 	 * set by considering UTF-8, the platform default, and falling back on
228 	 * ISO-8859-1 if neither of those can correctly decode the region given.
229 	 *
230 	 * @param start
231 	 *            first byte of the content to decode.
232 	 * @param end
233 	 *            one past the last byte of the content to decode.
234 	 * @return the region {@code [start, end)} decoded as a String.
235 	 */
236 	protected String decode(int start, int end) {
237 		return RawParseUtils.decode(content, start, end);
238 	}
239 
240 	private int getStart(int i) {
241 		return lines.get(i + 1);
242 	}
243 
244 	private int getEnd(int i) {
245 		return lines.get(i + 2);
246 	}
247 
248 	/**
249 	 * Obtains the buffer size to use for analyzing whether certain content is
250 	 * text or binary, or what line endings are used if it's text.
251 	 *
252 	 * @return the buffer size, by default {@link #FIRST_FEW_BYTES} bytes
253 	 * @since 6.0
254 	 */
255 	public static int getBufferSize() {
256 		return BUFFER_SIZE.get();
257 	}
258 
259 	/**
260 	 * Sets the buffer size to use for analyzing whether certain content is text
261 	 * or binary, or what line endings are used if it's text. If the given
262 	 * {@code bufferSize} is smaller than {@link #FIRST_FEW_BYTES} set the
263 	 * buffer size to {@link #FIRST_FEW_BYTES}.
264 	 *
265 	 * @param bufferSize
266 	 *            Size to set
267 	 * @return the size actually set
268 	 * @since 6.0
269 	 */
270 	public static int setBufferSize(int bufferSize) {
271 		int newSize = Math.max(FIRST_FEW_BYTES, bufferSize);
272 		return BUFFER_SIZE.updateAndGet(curr -> newSize);
273 	}
274 
275 	/**
276 	 * Determine heuristically whether the bytes contained in a stream
277 	 * represents binary (as opposed to text) content.
278 	 *
279 	 * Note: Do not further use this stream after having called this method! The
280 	 * stream may not be fully read and will be left at an unknown position
281 	 * after consuming an unknown number of bytes. The caller is responsible for
282 	 * closing the stream.
283 	 *
284 	 * @param raw
285 	 *            input stream containing the raw file content.
286 	 * @return true if raw is likely to be a binary file, false otherwise
287 	 * @throws java.io.IOException
288 	 *             if input stream could not be read
289 	 */
290 	public static boolean isBinary(InputStream raw) throws IOException {
291 		final byte[] buffer = new byte[getBufferSize()];
292 		int cnt = 0;
293 		while (cnt < buffer.length) {
294 			final int n = raw.read(buffer, cnt, buffer.length - cnt);
295 			if (n == -1)
296 				break;
297 			cnt += n;
298 		}
299 		return isBinary(buffer, cnt, cnt < buffer.length);
300 	}
301 
302 	/**
303 	 * Determine heuristically whether a byte array represents binary (as
304 	 * opposed to text) content.
305 	 *
306 	 * @param raw
307 	 *            the raw file content.
308 	 * @return true if raw is likely to be a binary file, false otherwise
309 	 */
310 	public static boolean isBinary(byte[] raw) {
311 		return isBinary(raw, raw.length);
312 	}
313 
314 	/**
315 	 * Determine heuristically whether a byte array represents binary (as
316 	 * opposed to text) content.
317 	 *
318 	 * @param raw
319 	 *            the raw file content.
320 	 * @param length
321 	 *            number of bytes in {@code raw} to evaluate. This should be
322 	 *            {@code raw.length} unless {@code raw} was over-allocated by
323 	 *            the caller.
324 	 * @return true if raw is likely to be a binary file, false otherwise
325 	 */
326 	public static boolean isBinary(byte[] raw, int length) {
327 		return isBinary(raw, length, false);
328 	}
329 
330 	/**
331 	 * Determine heuristically whether a byte array represents binary (as
332 	 * opposed to text) content.
333 	 *
334 	 * @param raw
335 	 *            the raw file content.
336 	 * @param length
337 	 *            number of bytes in {@code raw} to evaluate. This should be
338 	 *            {@code raw.length} unless {@code raw} was over-allocated by
339 	 *            the caller.
340 	 * @param complete
341 	 *            whether {@code raw} contains the whole data
342 	 * @return true if raw is likely to be a binary file, false otherwise
343 	 * @since 6.0
344 	 */
345 	public static boolean isBinary(byte[] raw, int length, boolean complete) {
346 		// Similar heuristic as C Git. Differences:
347 		// - limited buffer size; may be only the beginning of a large blob
348 		// - no counting of printable vs. non-printable bytes < 0x20 and 0x7F
349 		int maxLength = getBufferSize();
350 		if (length > maxLength) {
351 			length = maxLength;
352 		}
353 		byte last = 'x'; // Just something inconspicuous.
354 		for (int ptr = 0; ptr < length; ptr++) {
355 			byte curr = raw[ptr];
356 			if (isBinary(curr, last)) {
357 				return true;
358 			}
359 			last = curr;
360 		}
361 		if (complete) {
362 			// Buffer contains everything...
363 			return last == '\r'; // ... so this must be a lone CR
364 		}
365 		return false;
366 	}
367 
368 	/**
369 	 * Determines from the last two bytes read from a source if it looks like
370 	 * binary content.
371 	 *
372 	 * @param curr
373 	 *            the last byte, read after {@code prev}
374 	 * @param prev
375 	 *            the previous byte, read before {@code last}
376 	 * @return {@code true} if either byte is NUL, or if prev is CR and curr is
377 	 *         not LF, {@code false} otherwise
378 	 * @since 6.0
379 	 */
380 	public static boolean isBinary(byte curr, byte prev) {
381 		return curr == '\0' || (curr != '\n' && prev == '\r') || prev == '\0';
382 	}
383 
384 	/**
385 	 * Determine heuristically whether a byte array represents text content
386 	 * using CR-LF as line separator.
387 	 *
388 	 * @param raw
389 	 *            the raw file content.
390 	 * @return {@code true} if raw is likely to be CR-LF delimited text,
391 	 *         {@code false} otherwise
392 	 * @since 5.3
393 	 */
394 	public static boolean isCrLfText(byte[] raw) {
395 		return isCrLfText(raw, raw.length);
396 	}
397 
398 	/**
399 	 * Determine heuristically whether the bytes contained in a stream represent
400 	 * text content using CR-LF as line separator.
401 	 *
402 	 * Note: Do not further use this stream after having called this method! The
403 	 * stream may not be fully read and will be left at an unknown position
404 	 * after consuming an unknown number of bytes. The caller is responsible for
405 	 * closing the stream.
406 	 *
407 	 * @param raw
408 	 *            input stream containing the raw file content.
409 	 * @return {@code true} if raw is likely to be CR-LF delimited text,
410 	 *         {@code false} otherwise
411 	 * @throws java.io.IOException
412 	 *             if input stream could not be read
413 	 * @since 5.3
414 	 */
415 	public static boolean isCrLfText(InputStream raw) throws IOException {
416 		byte[] buffer = new byte[getBufferSize()];
417 		int cnt = 0;
418 		while (cnt < buffer.length) {
419 			int n = raw.read(buffer, cnt, buffer.length - cnt);
420 			if (n == -1) {
421 				break;
422 			}
423 			cnt += n;
424 		}
425 		return isCrLfText(buffer, cnt);
426 	}
427 
428 	/**
429 	 * Determine heuristically whether a byte array represents text content
430 	 * using CR-LF as line separator.
431 	 *
432 	 * @param raw
433 	 *            the raw file content.
434 	 * @param length
435 	 *            number of bytes in {@code raw} to evaluate.
436 	 * @return {@code true} if raw is likely to be CR-LF delimited text,
437 	 *         {@code false} otherwise
438 	 * @since 5.3
439 	 */
440 	public static boolean isCrLfText(byte[] raw, int length) {
441 		return isCrLfText(raw, length, false);
442 	}
443 
444 	/**
445 	 * Determine heuristically whether a byte array represents text content
446 	 * using CR-LF as line separator.
447 	 *
448 	 * @param raw
449 	 *            the raw file content.
450 	 * @param length
451 	 *            number of bytes in {@code raw} to evaluate.
452 	 * @return {@code true} if raw is likely to be CR-LF delimited text,
453 	 *         {@code false} otherwise
454 	 * @param complete
455 	 *            whether {@code raw} contains the whole data
456 	 * @since 6.0
457 	 */
458 	public static boolean isCrLfText(byte[] raw, int length, boolean complete) {
459 		boolean has_crlf = false;
460 		byte last = 'x'; // Just something inconspicuous
461 		for (int ptr = 0; ptr < length; ptr++) {
462 			byte curr = raw[ptr];
463 			if (isBinary(curr, last)) {
464 				return false;
465 			}
466 			if (curr == '\n' && last == '\r') {
467 				has_crlf = true;
468 			}
469 			last = curr;
470 		}
471 		if (last == '\r') {
472 			if (complete) {
473 				// Lone CR: it's binary after all.
474 				return false;
475 			}
476 			// Tough call. If the next byte, which we don't have, would be a
477 			// '\n', it'd be a CR-LF text, otherwise it'd be binary. Just decide
478 			// based on what we already scanned; it wasn't binary until now.
479 		}
480 		return has_crlf;
481 	}
482 
483 	/**
484 	 * Get the line delimiter for the first line.
485 	 *
486 	 * @since 2.0
487 	 * @return the line delimiter or <code>null</code>
488 	 */
489 	public String getLineDelimiter() {
490 		if (size() == 0) {
491 			return null;
492 		}
493 		int e = getEnd(0);
494 		if (content[e - 1] != '\n') {
495 			return null;
496 		}
497 		if (content.length > 1 && e > 1 && content[e - 2] == '\r') {
498 			return "\r\n"; //$NON-NLS-1$
499 		}
500 		return "\n"; //$NON-NLS-1$
501 	}
502 
503 	/**
504 	 * Read a blob object into RawText, or throw BinaryBlobException if the blob
505 	 * is binary.
506 	 *
507 	 * @param ldr
508 	 *            the ObjectLoader for the blob
509 	 * @param threshold
510 	 *            if the blob is larger than this size, it is always assumed to
511 	 *            be binary.
512 	 * @since 4.10
513 	 * @return the RawText representing the blob.
514 	 * @throws org.eclipse.jgit.errors.BinaryBlobException
515 	 *             if the blob contains binary data.
516 	 * @throws java.io.IOException
517 	 *             if the input could not be read.
518 	 */
519 	public static RawText load(ObjectLoader ldr, int threshold)
520 			throws IOException, BinaryBlobException {
521 		long sz = ldr.getSize();
522 
523 		if (sz > threshold) {
524 			throw new BinaryBlobException();
525 		}
526 
527 		int bufferSize = getBufferSize();
528 		if (sz <= bufferSize) {
529 			byte[] data = ldr.getCachedBytes(bufferSize);
530 			if (isBinary(data, data.length, true)) {
531 				throw new BinaryBlobException();
532 			}
533 			return new RawText(data);
534 		}
535 
536 		byte[] head = new byte[bufferSize];
537 		try (InputStream stream = ldr.openStream()) {
538 			int off = 0;
539 			int left = head.length;
540 			byte last = 'x'; // Just something inconspicuous
541 			while (left > 0) {
542 				int n = stream.read(head, off, left);
543 				if (n < 0) {
544 					throw new EOFException();
545 				}
546 				left -= n;
547 
548 				while (n > 0) {
549 					byte curr = head[off];
550 					if (isBinary(curr, last)) {
551 						throw new BinaryBlobException();
552 					}
553 					last = curr;
554 					off++;
555 					n--;
556 				}
557 			}
558 
559 			byte[] data;
560 			try {
561 				data = new byte[(int)sz];
562 			} catch (OutOfMemoryError e) {
563 				throw new LargeObjectException.OutOfMemory(e);
564 			}
565 
566 			System.arraycopy(head, 0, data, 0, head.length);
567 			IO.readFully(stream, data, off, (int) (sz-off));
568 			return new RawText(data, RawParseUtils.lineMapOrBinary(data, 0, (int) sz));
569 		}
570 	}
571 }