View Javadoc
1   /*
2    * Copyright (C) 2009, Google Inc.
3    * Copyright (C) 2008-2021, Johannes E. Schindelin <johannes.schindelin@gmx.de> and others
4    *
5    * This program and the accompanying materials are made available under the
6    * terms of the Eclipse Distribution License v. 1.0 which is available at
7    * https://www.eclipse.org/org/documents/edl-v10.php.
8    *
9    * SPDX-License-Identifier: BSD-3-Clause
10   */
11  
12  package org.eclipse.jgit.diff;
13  
14  import java.io.EOFException;
15  import java.io.File;
16  import java.io.IOException;
17  import java.io.InputStream;
18  import java.io.OutputStream;
19  import java.nio.ByteBuffer;
20  
21  import org.eclipse.jgit.errors.BinaryBlobException;
22  import org.eclipse.jgit.errors.LargeObjectException;
23  import org.eclipse.jgit.lib.ObjectLoader;
24  import org.eclipse.jgit.util.IO;
25  import org.eclipse.jgit.util.IntList;
26  import org.eclipse.jgit.util.RawParseUtils;
27  
28  /**
29   * A Sequence supporting UNIX formatted text in byte[] format.
30   * <p>
31   * Elements of the sequence are the lines of the file, as delimited by the UNIX
32   * newline character ('\n'). The file content is treated as 8 bit binary text,
33   * with no assumptions or requirements on character encoding.
34   * <p>
35   * Note that the first line of the file is element 0, as defined by the Sequence
36   * interface API. Traditionally in a text editor a patch file the first line is
37   * line number 1. Callers may need to subtract 1 prior to invoking methods if
38   * they are converting from "line number" to "element index".
39   */
40  public class RawText extends Sequence {
41  	/** A RawText of length 0 */
42  	public static final RawText EMPTY_TEXT = new RawText(new byte[0]);
43  
44  	/** Number of bytes to check for heuristics in {@link #isBinary(byte[])} */
45  	static final int FIRST_FEW_BYTES = 8000;
46  
47  	/** The file content for this sequence. */
48  	protected final byte[] content;
49  
50  	/** Map of line number to starting position within {@link #content}. */
51  	protected final IntList lines;
52  
53  	/**
54  	 * Create a new sequence from an existing content byte array.
55  	 * <p>
56  	 * The entire array (indexes 0 through length-1) is used as the content.
57  	 *
58  	 * @param input
59  	 *            the content array. The object retains a reference to this
60  	 *            array, so it should be immutable.
61  	 */
62  	public RawText(byte[] input) {
63  		this(input, RawParseUtils.lineMap(input, 0, input.length));
64  	}
65  
66  	/**
67  	 * Create a new sequence from the existing content byte array and the line
68  	 * map indicating line boundaries.
69  	 *
70  	 * @param input
71  	 *            the content array. The object retains a reference to this
72  	 *            array, so it should be immutable.
73  	 * @param lineMap
74  	 *            an array with 1-based offsets for the start of each line.
75  	 *            The first and last entries should be {@link Integer#MIN_VALUE}
76  	 *            and an offset one past the end of the last line, respectively.
77  	 * @since 5.0
78  	 */
79  	public RawText(byte[] input, IntList lineMap) {
80  		content = input;
81  		lines = lineMap;
82  	}
83  
84  	/**
85  	 * Create a new sequence from a file.
86  	 * <p>
87  	 * The entire file contents are used.
88  	 *
89  	 * @param file
90  	 *            the text file.
91  	 * @throws java.io.IOException
92  	 *             if Exceptions occur while reading the file
93  	 */
94  	public RawText(File file) throws IOException {
95  		this(IO.readFully(file));
96  	}
97  
98  	/**
99  	 * @return the raw, unprocessed content read.
100 	 * @since 4.11
101 	 */
102 	public byte[] getRawContent() {
103 		return content;
104 	}
105 
106 	/** @return total number of items in the sequence. */
107 	/** {@inheritDoc} */
108 	@Override
109 	public int size() {
110 		// The line map is always 2 entries larger than the number of lines in
111 		// the file. Index 0 is padded out/unused. The last index is the total
112 		// length of the buffer, and acts as a sentinel.
113 		//
114 		return lines.size() - 2;
115 	}
116 
117 	/**
118 	 * Write a specific line to the output stream, without its trailing LF.
119 	 * <p>
120 	 * The specified line is copied as-is, with no character encoding
121 	 * translation performed.
122 	 * <p>
123 	 * If the specified line ends with an LF ('\n'), the LF is <b>not</b>
124 	 * copied. It is up to the caller to write the LF, if desired, between
125 	 * output lines.
126 	 *
127 	 * @param out
128 	 *            stream to copy the line data onto.
129 	 * @param i
130 	 *            index of the line to extract. Note this is 0-based, so line
131 	 *            number 1 is actually index 0.
132 	 * @throws java.io.IOException
133 	 *             the stream write operation failed.
134 	 */
135 	public void writeLine(OutputStream out, int i)
136 			throws IOException {
137 		int start = getStart(i);
138 		int end = getEnd(i);
139 		if (content[end - 1] == '\n')
140 			end--;
141 		out.write(content, start, end - start);
142 	}
143 
144 	/**
145 	 * Determine if the file ends with a LF ('\n').
146 	 *
147 	 * @return true if the last line has an LF; false otherwise.
148 	 */
149 	public boolean isMissingNewlineAtEnd() {
150 		final int end = lines.get(lines.size() - 1);
151 		if (end == 0)
152 			return true;
153 		return content[end - 1] != '\n';
154 	}
155 
156 	/**
157 	 * Get the text for a single line.
158 	 *
159 	 * @param i
160 	 *            index of the line to extract. Note this is 0-based, so line
161 	 *            number 1 is actually index 0.
162 	 * @return the text for the line, without a trailing LF.
163 	 */
164 	public String getString(int i) {
165 		return getString(i, i + 1, true);
166 	}
167 
168 	/**
169 	 * Get the raw text for a single line.
170 	 *
171 	 * @param i
172 	 *            index of the line to extract. Note this is 0-based, so line
173 	 *            number 1 is actually index 0.
174 	 * @return the text for the line, without a trailing LF, as a
175 	 *         {@link ByteBuffer} that is backed by a slice of the
176 	 *         {@link #getRawContent() raw content}, with the buffer's position
177 	 *         on the start of the line and the limit at the end.
178 	 * @since 5.12
179 	 */
180 	public ByteBuffer getRawString(int i) {
181 		int s = getStart(i);
182 		int e = getEnd(i);
183 		if (e > 0 && content[e - 1] == '\n') {
184 			e--;
185 		}
186 		return ByteBuffer.wrap(content, s, e - s);
187 	}
188 
189 	/**
190 	 * Get the text for a region of lines.
191 	 *
192 	 * @param begin
193 	 *            index of the first line to extract. Note this is 0-based, so
194 	 *            line number 1 is actually index 0.
195 	 * @param end
196 	 *            index of one past the last line to extract.
197 	 * @param dropLF
198 	 *            if true the trailing LF ('\n') of the last returned line is
199 	 *            dropped, if present.
200 	 * @return the text for lines {@code [begin, end)}.
201 	 */
202 	public String getString(int begin, int end, boolean dropLF) {
203 		if (begin == end)
204 			return ""; //$NON-NLS-1$
205 
206 		int s = getStart(begin);
207 		int e = getEnd(end - 1);
208 		if (dropLF && content[e - 1] == '\n')
209 			e--;
210 		return decode(s, e);
211 	}
212 
213 	/**
214 	 * Decode a region of the text into a String.
215 	 *
216 	 * The default implementation of this method tries to guess the character
217 	 * set by considering UTF-8, the platform default, and falling back on
218 	 * ISO-8859-1 if neither of those can correctly decode the region given.
219 	 *
220 	 * @param start
221 	 *            first byte of the content to decode.
222 	 * @param end
223 	 *            one past the last byte of the content to decode.
224 	 * @return the region {@code [start, end)} decoded as a String.
225 	 */
226 	protected String decode(int start, int end) {
227 		return RawParseUtils.decode(content, start, end);
228 	}
229 
230 	private int getStart(int i) {
231 		return lines.get(i + 1);
232 	}
233 
234 	private int getEnd(int i) {
235 		return lines.get(i + 2);
236 	}
237 
238 	/**
239 	 * Determine heuristically whether a byte array represents binary (as
240 	 * opposed to text) content.
241 	 *
242 	 * @param raw
243 	 *            the raw file content.
244 	 * @return true if raw is likely to be a binary file, false otherwise
245 	 */
246 	public static boolean isBinary(byte[] raw) {
247 		return isBinary(raw, raw.length);
248 	}
249 
250 	/**
251 	 * Determine heuristically whether the bytes contained in a stream
252 	 * represents binary (as opposed to text) content.
253 	 *
254 	 * Note: Do not further use this stream after having called this method! The
255 	 * stream may not be fully read and will be left at an unknown position
256 	 * after consuming an unknown number of bytes. The caller is responsible for
257 	 * closing the stream.
258 	 *
259 	 * @param raw
260 	 *            input stream containing the raw file content.
261 	 * @return true if raw is likely to be a binary file, false otherwise
262 	 * @throws java.io.IOException
263 	 *             if input stream could not be read
264 	 */
265 	public static boolean isBinary(InputStream raw) throws IOException {
266 		final byte[] buffer = new byte[FIRST_FEW_BYTES];
267 		int cnt = 0;
268 		while (cnt < buffer.length) {
269 			final int n = raw.read(buffer, cnt, buffer.length - cnt);
270 			if (n == -1)
271 				break;
272 			cnt += n;
273 		}
274 		return isBinary(buffer, cnt);
275 	}
276 
277 	/**
278 	 * Determine heuristically whether a byte array represents binary (as
279 	 * opposed to text) content.
280 	 *
281 	 * @param raw
282 	 *            the raw file content.
283 	 * @param length
284 	 *            number of bytes in {@code raw} to evaluate. This should be
285 	 *            {@code raw.length} unless {@code raw} was over-allocated by
286 	 *            the caller.
287 	 * @return true if raw is likely to be a binary file, false otherwise
288 	 */
289 	public static boolean isBinary(byte[] raw, int length) {
290 		// Same heuristic as C Git
291 		if (length > FIRST_FEW_BYTES)
292 			length = FIRST_FEW_BYTES;
293 		for (int ptr = 0; ptr < length; ptr++)
294 			if (raw[ptr] == '\0')
295 				return true;
296 
297 		return false;
298 	}
299 
300 	/**
301 	 * Determine heuristically whether a byte array represents text content
302 	 * using CR-LF as line separator.
303 	 *
304 	 * @param raw
305 	 *            the raw file content.
306 	 * @return {@code true} if raw is likely to be CR-LF delimited text,
307 	 *         {@code false} otherwise
308 	 * @since 5.3
309 	 */
310 	public static boolean isCrLfText(byte[] raw) {
311 		return isCrLfText(raw, raw.length);
312 	}
313 
314 	/**
315 	 * Determine heuristically whether the bytes contained in a stream represent
316 	 * text content using CR-LF as line separator.
317 	 *
318 	 * Note: Do not further use this stream after having called this method! The
319 	 * stream may not be fully read and will be left at an unknown position
320 	 * after consuming an unknown number of bytes. The caller is responsible for
321 	 * closing the stream.
322 	 *
323 	 * @param raw
324 	 *            input stream containing the raw file content.
325 	 * @return {@code true} if raw is likely to be CR-LF delimited text,
326 	 *         {@code false} otherwise
327 	 * @throws java.io.IOException
328 	 *             if input stream could not be read
329 	 * @since 5.3
330 	 */
331 	public static boolean isCrLfText(InputStream raw) throws IOException {
332 		byte[] buffer = new byte[FIRST_FEW_BYTES];
333 		int cnt = 0;
334 		while (cnt < buffer.length) {
335 			int n = raw.read(buffer, cnt, buffer.length - cnt);
336 			if (n == -1) {
337 				break;
338 			}
339 			cnt += n;
340 		}
341 		return isCrLfText(buffer, cnt);
342 	}
343 
344 	/**
345 	 * Determine heuristically whether a byte array represents text content
346 	 * using CR-LF as line separator.
347 	 *
348 	 * @param raw
349 	 *            the raw file content.
350 	 * @param length
351 	 *            number of bytes in {@code raw} to evaluate.
352 	 * @return {@code true} if raw is likely to be CR-LF delimited text,
353 	 *         {@code false} otherwise
354 	 * @since 5.3
355 	 */
356 	public static boolean isCrLfText(byte[] raw, int length) {
357 		boolean has_crlf = false;
358 		for (int ptr = 0; ptr < length - 1; ptr++) {
359 			if (raw[ptr] == '\0') {
360 				return false; // binary
361 			} else if (raw[ptr] == '\r' && raw[ptr + 1] == '\n') {
362 				has_crlf = true;
363 			}
364 		}
365 		return has_crlf;
366 	}
367 
368 	/**
369 	 * Get the line delimiter for the first line.
370 	 *
371 	 * @since 2.0
372 	 * @return the line delimiter or <code>null</code>
373 	 */
374 	public String getLineDelimiter() {
375 		if (size() == 0) {
376 			return null;
377 		}
378 		int e = getEnd(0);
379 		if (content[e - 1] != '\n') {
380 			return null;
381 		}
382 		if (content.length > 1 && e > 1 && content[e - 2] == '\r') {
383 			return "\r\n"; //$NON-NLS-1$
384 		}
385 		return "\n"; //$NON-NLS-1$
386 	}
387 
388 	/**
389 	 * Read a blob object into RawText, or throw BinaryBlobException if the blob
390 	 * is binary.
391 	 *
392 	 * @param ldr
393 	 *            the ObjectLoader for the blob
394 	 * @param threshold
395 	 *            if the blob is larger than this size, it is always assumed to
396 	 *            be binary.
397 	 * @since 4.10
398 	 * @return the RawText representing the blob.
399 	 * @throws org.eclipse.jgit.errors.BinaryBlobException
400 	 *             if the blob contains binary data.
401 	 * @throws java.io.IOException
402 	 *             if the input could not be read.
403 	 */
404 	public static RawText load(ObjectLoader ldr, int threshold)
405 			throws IOException, BinaryBlobException {
406 		long sz = ldr.getSize();
407 
408 		if (sz > threshold) {
409 			throw new BinaryBlobException();
410 		}
411 
412 		if (sz <= FIRST_FEW_BYTES) {
413 			byte[] data = ldr.getCachedBytes(FIRST_FEW_BYTES);
414 			if (isBinary(data)) {
415 				throw new BinaryBlobException();
416 			}
417 			return new RawText(data);
418 		}
419 
420 		byte[] head = new byte[FIRST_FEW_BYTES];
421 		try (InputStream stream = ldr.openStream()) {
422 			int off = 0;
423 			int left = head.length;
424 			while (left > 0) {
425 				int n = stream.read(head, off, left);
426 				if (n < 0) {
427 					throw new EOFException();
428 				}
429 				left -= n;
430 
431 				while (n > 0) {
432 					if (head[off] == '\0') {
433 						throw new BinaryBlobException();
434 					}
435 					off++;
436 					n--;
437 				}
438 			}
439 
440 			byte[] data;
441 			try {
442 				data = new byte[(int)sz];
443 			} catch (OutOfMemoryError e) {
444 				throw new LargeObjectException.OutOfMemory(e);
445 			}
446 
447 			System.arraycopy(head, 0, data, 0, head.length);
448 			IO.readFully(stream, data, off, (int) (sz-off));
449 			return new RawText(data, RawParseUtils.lineMapOrBinary(data, 0, (int) sz));
450 		}
451 	}
452 }