View Javadoc
1   /*
2    * Copyright (C) 2008-2009, Google Inc.
3    * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org> and others
4    *
5    * This program and the accompanying materials are made available under the
6    * terms of the Eclipse Distribution License v. 1.0 which is available at
7    * https://www.eclipse.org/org/documents/edl-v10.php.
8    *
9    * SPDX-License-Identifier: BSD-3-Clause
10   */
11  
12  package org.eclipse.jgit.util;
13  
14  import static java.nio.charset.StandardCharsets.ISO_8859_1;
15  import static java.nio.charset.StandardCharsets.UTF_8;
16  import static org.eclipse.jgit.lib.ObjectChecker.author;
17  import static org.eclipse.jgit.lib.ObjectChecker.committer;
18  import static org.eclipse.jgit.lib.ObjectChecker.encoding;
19  import static org.eclipse.jgit.lib.ObjectChecker.tagger;
20  
21  import java.nio.ByteBuffer;
22  import java.nio.charset.CharacterCodingException;
23  import java.nio.charset.Charset;
24  import java.nio.charset.CharsetDecoder;
25  import java.nio.charset.CodingErrorAction;
26  import java.nio.charset.IllegalCharsetNameException;
27  import java.nio.charset.UnsupportedCharsetException;
28  import java.util.Arrays;
29  import java.util.HashMap;
30  import java.util.Map;
31  
32  import org.eclipse.jgit.annotations.Nullable;
33  import org.eclipse.jgit.diff.RawText;
34  import org.eclipse.jgit.errors.BinaryBlobException;
35  import org.eclipse.jgit.lib.Constants;
36  import org.eclipse.jgit.lib.PersonIdent;
37  
38  /**
39   * Handy utility functions to parse raw object contents.
40   */
41  public final class RawParseUtils {
42  	/**
43  	 * UTF-8 charset constant.
44  	 *
45  	 * @since 2.2
46  	 * @deprecated use {@link java.nio.charset.StandardCharsets#UTF_8} instead
47  	 */
48  	@Deprecated
49  	public static final Charset UTF8_CHARSET = UTF_8;
50  
51  	private static final byte[] digits10;
52  
53  	private static final byte[] digits16;
54  
55  	private static final byte[] footerLineKeyChars;
56  
57  	private static final Map<String, Charset> encodingAliases;
58  
59  	static {
60  		encodingAliases = new HashMap<>();
61  		encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
62  		encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$
63  
64  		digits10 = new byte['9' + 1];
65  		Arrays.fill(digits10, (byte) -1);
66  		for (char i = '0'; i <= '9'; i++)
67  			digits10[i] = (byte) (i - '0');
68  
69  		digits16 = new byte['f' + 1];
70  		Arrays.fill(digits16, (byte) -1);
71  		for (char i = '0'; i <= '9'; i++)
72  			digits16[i] = (byte) (i - '0');
73  		for (char i = 'a'; i <= 'f'; i++)
74  			digits16[i] = (byte) ((i - 'a') + 10);
75  		for (char i = 'A'; i <= 'F'; i++)
76  			digits16[i] = (byte) ((i - 'A') + 10);
77  
78  		footerLineKeyChars = new byte['z' + 1];
79  		footerLineKeyChars['-'] = 1;
80  		for (char i = '0'; i <= '9'; i++)
81  			footerLineKeyChars[i] = 1;
82  		for (char i = 'A'; i <= 'Z'; i++)
83  			footerLineKeyChars[i] = 1;
84  		for (char i = 'a'; i <= 'z'; i++)
85  			footerLineKeyChars[i] = 1;
86  	}
87  
88  	/**
89  	 * Determine if b[ptr] matches src.
90  	 *
91  	 * @param b
92  	 *            the buffer to scan.
93  	 * @param ptr
94  	 *            first position within b, this should match src[0].
95  	 * @param src
96  	 *            the buffer to test for equality with b.
97  	 * @return ptr + src.length if b[ptr..src.length] == src; else -1.
98  	 */
99  	public static final int match(byte[] b, int ptr, byte[] src) {
100 		if (ptr + src.length > b.length)
101 			return -1;
102 		for (int i = 0; i < src.length; i++, ptr++)
103 			if (b[ptr] != src[i])
104 				return -1;
105 		return ptr;
106 	}
107 
108 	private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
109 			'6', '7', '8', '9' };
110 
111 	/**
112 	 * Format a base 10 numeric into a temporary buffer.
113 	 * <p>
114 	 * Formatting is performed backwards. The method starts at offset
115 	 * <code>o-1</code> and ends at <code>o-1-digits</code>, where
116 	 * <code>digits</code> is the number of positions necessary to store the
117 	 * base 10 value.
118 	 * <p>
119 	 * The argument and return values from this method make it easy to chain
120 	 * writing, for example:
121 	 * </p>
122 	 *
123 	 * <pre>
124 	 * final byte[] tmp = new byte[64];
125 	 * int ptr = tmp.length;
126 	 * tmp[--ptr] = '\n';
127 	 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
128 	 * tmp[--ptr] = ' ';
129 	 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
130 	 * tmp[--ptr] = 0;
131 	 * final String str = new String(tmp, ptr, tmp.length - ptr);
132 	 * </pre>
133 	 *
134 	 * @param b
135 	 *            buffer to write into.
136 	 * @param o
137 	 *            one offset past the location where writing will begin; writing
138 	 *            proceeds towards lower index values.
139 	 * @param value
140 	 *            the value to store.
141 	 * @return the new offset value <code>o</code>. This is the position of
142 	 *         the last byte written. Additional writing should start at one
143 	 *         position earlier.
144 	 */
145 	public static int formatBase10(final byte[] b, int o, int value) {
146 		if (value == 0) {
147 			b[--o] = '0';
148 			return o;
149 		}
150 		final boolean isneg = value < 0;
151 		if (isneg)
152 			value = -value;
153 		while (value != 0) {
154 			b[--o] = base10byte[value % 10];
155 			value /= 10;
156 		}
157 		if (isneg)
158 			b[--o] = '-';
159 		return o;
160 	}
161 
162 	/**
163 	 * Parse a base 10 numeric from a sequence of ASCII digits into an int.
164 	 * <p>
165 	 * Digit sequences can begin with an optional run of spaces before the
166 	 * sequence, and may start with a '+' or a '-' to indicate sign position.
167 	 * Any other characters will cause the method to stop and return the current
168 	 * result to the caller.
169 	 *
170 	 * @param b
171 	 *            buffer to scan.
172 	 * @param ptr
173 	 *            position within buffer to start parsing digits at.
174 	 * @param ptrResult
175 	 *            optional location to return the new ptr value through. If null
176 	 *            the ptr value will be discarded.
177 	 * @return the value at this location; 0 if the location is not a valid
178 	 *         numeric.
179 	 */
180 	public static final int parseBase10(final byte[] b, int ptr,
181 			final MutableInteger ptrResult) {
182 		int r = 0;
183 		int sign = 0;
184 		try {
185 			final int sz = b.length;
186 			while (ptr < sz && b[ptr] == ' ')
187 				ptr++;
188 			if (ptr >= sz)
189 				return 0;
190 
191 			switch (b[ptr]) {
192 			case '-':
193 				sign = -1;
194 				ptr++;
195 				break;
196 			case '+':
197 				ptr++;
198 				break;
199 			}
200 
201 			while (ptr < sz) {
202 				final byte v = digits10[b[ptr]];
203 				if (v < 0)
204 					break;
205 				r = (r * 10) + v;
206 				ptr++;
207 			}
208 		} catch (ArrayIndexOutOfBoundsException e) {
209 			// Not a valid digit.
210 		}
211 		if (ptrResult != null)
212 			ptrResult.value = ptr;
213 		return sign < 0 ? -r : r;
214 	}
215 
216 	/**
217 	 * Parse a base 10 numeric from a sequence of ASCII digits into a long.
218 	 * <p>
219 	 * Digit sequences can begin with an optional run of spaces before the
220 	 * sequence, and may start with a '+' or a '-' to indicate sign position.
221 	 * Any other characters will cause the method to stop and return the current
222 	 * result to the caller.
223 	 *
224 	 * @param b
225 	 *            buffer to scan.
226 	 * @param ptr
227 	 *            position within buffer to start parsing digits at.
228 	 * @param ptrResult
229 	 *            optional location to return the new ptr value through. If null
230 	 *            the ptr value will be discarded.
231 	 * @return the value at this location; 0 if the location is not a valid
232 	 *         numeric.
233 	 */
234 	public static final long parseLongBase10(final byte[] b, int ptr,
235 			final MutableInteger ptrResult) {
236 		long r = 0;
237 		int sign = 0;
238 		try {
239 			final int sz = b.length;
240 			while (ptr < sz && b[ptr] == ' ')
241 				ptr++;
242 			if (ptr >= sz)
243 				return 0;
244 
245 			switch (b[ptr]) {
246 			case '-':
247 				sign = -1;
248 				ptr++;
249 				break;
250 			case '+':
251 				ptr++;
252 				break;
253 			}
254 
255 			while (ptr < sz) {
256 				final byte v = digits10[b[ptr]];
257 				if (v < 0)
258 					break;
259 				r = (r * 10) + v;
260 				ptr++;
261 			}
262 		} catch (ArrayIndexOutOfBoundsException e) {
263 			// Not a valid digit.
264 		}
265 		if (ptrResult != null)
266 			ptrResult.value = ptr;
267 		return sign < 0 ? -r : r;
268 	}
269 
270 	/**
271 	 * Parse 4 character base 16 (hex) formatted string to unsigned integer.
272 	 * <p>
273 	 * The number is read in network byte order, that is, most significant
274 	 * nybble first.
275 	 *
276 	 * @param bs
277 	 *            buffer to parse digits from; positions {@code [p, p+4)} will
278 	 *            be parsed.
279 	 * @param p
280 	 *            first position within the buffer to parse.
281 	 * @return the integer value.
282 	 * @throws java.lang.ArrayIndexOutOfBoundsException
283 	 *             if the string is not hex formatted.
284 	 */
285 	public static final int parseHexInt16(final byte[] bs, final int p) {
286 		int r = digits16[bs[p]] << 4;
287 
288 		r |= digits16[bs[p + 1]];
289 		r <<= 4;
290 
291 		r |= digits16[bs[p + 2]];
292 		r <<= 4;
293 
294 		r |= digits16[bs[p + 3]];
295 		if (r < 0)
296 			throw new ArrayIndexOutOfBoundsException();
297 		return r;
298 	}
299 
300 	/**
301 	 * Parse 8 character base 16 (hex) formatted string to unsigned integer.
302 	 * <p>
303 	 * The number is read in network byte order, that is, most significant
304 	 * nybble first.
305 	 *
306 	 * @param bs
307 	 *            buffer to parse digits from; positions {@code [p, p+8)} will
308 	 *            be parsed.
309 	 * @param p
310 	 *            first position within the buffer to parse.
311 	 * @return the integer value.
312 	 * @throws java.lang.ArrayIndexOutOfBoundsException
313 	 *             if the string is not hex formatted.
314 	 */
315 	public static final int parseHexInt32(final byte[] bs, final int p) {
316 		int r = digits16[bs[p]] << 4;
317 
318 		r |= digits16[bs[p + 1]];
319 		r <<= 4;
320 
321 		r |= digits16[bs[p + 2]];
322 		r <<= 4;
323 
324 		r |= digits16[bs[p + 3]];
325 		r <<= 4;
326 
327 		r |= digits16[bs[p + 4]];
328 		r <<= 4;
329 
330 		r |= digits16[bs[p + 5]];
331 		r <<= 4;
332 
333 		r |= digits16[bs[p + 6]];
334 
335 		final int last = digits16[bs[p + 7]];
336 		if (r < 0 || last < 0)
337 			throw new ArrayIndexOutOfBoundsException();
338 		return (r << 4) | last;
339 	}
340 
341 	/**
342 	 * Parse 16 character base 16 (hex) formatted string to unsigned long.
343 	 * <p>
344 	 * The number is read in network byte order, that is, most significant
345 	 * nibble first.
346 	 *
347 	 * @param bs
348 	 *            buffer to parse digits from; positions {@code [p, p+16)} will
349 	 *            be parsed.
350 	 * @param p
351 	 *            first position within the buffer to parse.
352 	 * @return the integer value.
353 	 * @throws java.lang.ArrayIndexOutOfBoundsException
354 	 *             if the string is not hex formatted.
355 	 * @since 4.3
356 	 */
357 	public static final long parseHexInt64(final byte[] bs, final int p) {
358 		long r = digits16[bs[p]] << 4;
359 
360 		r |= digits16[bs[p + 1]];
361 		r <<= 4;
362 
363 		r |= digits16[bs[p + 2]];
364 		r <<= 4;
365 
366 		r |= digits16[bs[p + 3]];
367 		r <<= 4;
368 
369 		r |= digits16[bs[p + 4]];
370 		r <<= 4;
371 
372 		r |= digits16[bs[p + 5]];
373 		r <<= 4;
374 
375 		r |= digits16[bs[p + 6]];
376 		r <<= 4;
377 
378 		r |= digits16[bs[p + 7]];
379 		r <<= 4;
380 
381 		r |= digits16[bs[p + 8]];
382 		r <<= 4;
383 
384 		r |= digits16[bs[p + 9]];
385 		r <<= 4;
386 
387 		r |= digits16[bs[p + 10]];
388 		r <<= 4;
389 
390 		r |= digits16[bs[p + 11]];
391 		r <<= 4;
392 
393 		r |= digits16[bs[p + 12]];
394 		r <<= 4;
395 
396 		r |= digits16[bs[p + 13]];
397 		r <<= 4;
398 
399 		r |= digits16[bs[p + 14]];
400 
401 		final int last = digits16[bs[p + 15]];
402 		if (r < 0 || last < 0)
403 			throw new ArrayIndexOutOfBoundsException();
404 		return (r << 4) | last;
405 	}
406 
407 	/**
408 	 * Parse a single hex digit to its numeric value (0-15).
409 	 *
410 	 * @param digit
411 	 *            hex character to parse.
412 	 * @return numeric value, in the range 0-15.
413 	 * @throws java.lang.ArrayIndexOutOfBoundsException
414 	 *             if the input digit is not a valid hex digit.
415 	 */
416 	public static final int parseHexInt4(final byte digit) {
417 		final byte r = digits16[digit];
418 		if (r < 0)
419 			throw new ArrayIndexOutOfBoundsException();
420 		return r;
421 	}
422 
423 	/**
424 	 * Parse a Git style timezone string.
425 	 * <p>
426 	 * The sequence "-0315" will be parsed as the numeric value -195, as the
427 	 * lower two positions count minutes, not 100ths of an hour.
428 	 *
429 	 * @param b
430 	 *            buffer to scan.
431 	 * @param ptr
432 	 *            position within buffer to start parsing digits at.
433 	 * @return the timezone at this location, expressed in minutes.
434 	 */
435 	public static final int parseTimeZoneOffset(byte[] b, int ptr) {
436 		return parseTimeZoneOffset(b, ptr, null);
437 	}
438 
439 	/**
440 	 * Parse a Git style timezone string.
441 	 * <p>
442 	 * The sequence "-0315" will be parsed as the numeric value -195, as the
443 	 * lower two positions count minutes, not 100ths of an hour.
444 	 *
445 	 * @param b
446 	 *            buffer to scan.
447 	 * @param ptr
448 	 *            position within buffer to start parsing digits at.
449 	 * @param ptrResult
450 	 *            optional location to return the new ptr value through. If null
451 	 *            the ptr value will be discarded.
452 	 * @return the timezone at this location, expressed in minutes.
453 	 * @since 4.1
454 	 */
455 	public static final int parseTimeZoneOffset(final byte[] b, int ptr,
456 			MutableInteger ptrResult) {
457 		final int v = parseBase10(b, ptr, ptrResult);
458 		final int tzMins = v % 100;
459 		final int tzHours = v / 100;
460 		return tzHours * 60 + tzMins;
461 	}
462 
463 	/**
464 	 * Locate the first position after a given character.
465 	 *
466 	 * @param b
467 	 *            buffer to scan.
468 	 * @param ptr
469 	 *            position within buffer to start looking for chrA at.
470 	 * @param chrA
471 	 *            character to find.
472 	 * @return new position just after chrA.
473 	 */
474 	public static final int next(byte[] b, int ptr, char chrA) {
475 		final int sz = b.length;
476 		while (ptr < sz) {
477 			if (b[ptr++] == chrA)
478 				return ptr;
479 		}
480 		return ptr;
481 	}
482 
483 	/**
484 	 * Locate the first position after the next LF.
485 	 * <p>
486 	 * This method stops on the first '\n' it finds.
487 	 *
488 	 * @param b
489 	 *            buffer to scan.
490 	 * @param ptr
491 	 *            position within buffer to start looking for LF at.
492 	 * @return new position just after the first LF found.
493 	 */
494 	public static final int nextLF(byte[] b, int ptr) {
495 		return next(b, ptr, '\n');
496 	}
497 
498 	/**
499 	 * Locate the first position after either the given character or LF.
500 	 * <p>
501 	 * This method stops on the first match it finds from either chrA or '\n'.
502 	 *
503 	 * @param b
504 	 *            buffer to scan.
505 	 * @param ptr
506 	 *            position within buffer to start looking for chrA or LF at.
507 	 * @param chrA
508 	 *            character to find.
509 	 * @return new position just after the first chrA or LF to be found.
510 	 */
511 	public static final int nextLF(byte[] b, int ptr, char chrA) {
512 		final int sz = b.length;
513 		while (ptr < sz) {
514 			final byte c = b[ptr++];
515 			if (c == chrA || c == '\n')
516 				return ptr;
517 		}
518 		return ptr;
519 	}
520 
521 	/**
522 	 * Locate the end of the header.  Note that headers may be
523 	 * more than one line long.
524 	 * @param b
525 	 *            buffer to scan.
526 	 * @param ptr
527 	 *            position within buffer to start looking for the end-of-header.
528 	 * @return new position just after the header.  This is either
529 	 * b.length, or the index of the header's terminating newline.
530 	 * @since 5.1
531 	 */
532 	public static final int headerEnd(final byte[] b, int ptr) {
533 		final int sz = b.length;
534 		while (ptr < sz) {
535 			final byte c = b[ptr++];
536 			if (c == '\n' && (ptr == sz || b[ptr] != ' ')) {
537 				return ptr - 1;
538 			}
539 		}
540 		return ptr - 1;
541 	}
542 
543 	/**
544 	 * Find the start of the contents of a given header.
545 	 *
546 	 * @param b
547 	 *            buffer to scan.
548 	 * @param headerName
549 	 *            header to search for
550 	 * @param ptr
551 	 *            position within buffer to start looking for header at.
552 	 * @return new position at the start of the header's contents, -1 for
553 	 *         not found
554 	 * @since 5.1
555 	 */
556 	public static final int headerStart(byte[] headerName, byte[] b, int ptr) {
557 		// Start by advancing to just past a LF or buffer start
558 		if (ptr != 0) {
559 			ptr = nextLF(b, ptr - 1);
560 		}
561 		while (ptr < b.length - (headerName.length + 1)) {
562 			boolean found = true;
563 			for (byte element : headerName) {
564 				if (element != b[ptr++]) {
565 					found = false;
566 					break;
567 				}
568 			}
569 			if (found && b[ptr++] == ' ') {
570 				return ptr;
571 			}
572 			ptr = nextLF(b, ptr);
573 		}
574 		return -1;
575 	}
576 
577 	/**
578 	 * Locate the first position before a given character.
579 	 *
580 	 * @param b
581 	 *            buffer to scan.
582 	 * @param ptr
583 	 *            position within buffer to start looking for chrA at.
584 	 * @param chrA
585 	 *            character to find.
586 	 * @return new position just before chrA, -1 for not found
587 	 */
588 	public static final int prev(byte[] b, int ptr, char chrA) {
589 		if (ptr == b.length)
590 			--ptr;
591 		while (ptr >= 0) {
592 			if (b[ptr--] == chrA)
593 				return ptr;
594 		}
595 		return ptr;
596 	}
597 
598 	/**
599 	 * Locate the first position before the previous LF.
600 	 * <p>
601 	 * This method stops on the first '\n' it finds.
602 	 *
603 	 * @param b
604 	 *            buffer to scan.
605 	 * @param ptr
606 	 *            position within buffer to start looking for LF at.
607 	 * @return new position just before the first LF found, -1 for not found
608 	 */
609 	public static final int prevLF(byte[] b, int ptr) {
610 		return prev(b, ptr, '\n');
611 	}
612 
613 	/**
614 	 * Locate the previous position before either the given character or LF.
615 	 * <p>
616 	 * This method stops on the first match it finds from either chrA or '\n'.
617 	 *
618 	 * @param b
619 	 *            buffer to scan.
620 	 * @param ptr
621 	 *            position within buffer to start looking for chrA or LF at.
622 	 * @param chrA
623 	 *            character to find.
624 	 * @return new position just before the first chrA or LF to be found, -1 for
625 	 *         not found
626 	 */
627 	public static final int prevLF(byte[] b, int ptr, char chrA) {
628 		if (ptr == b.length)
629 			--ptr;
630 		while (ptr >= 0) {
631 			final byte c = b[ptr--];
632 			if (c == chrA || c == '\n')
633 				return ptr;
634 		}
635 		return ptr;
636 	}
637 
638 	/**
639 	 * Index the region between <code>[ptr, end)</code> to find line starts.
640 	 * <p>
641 	 * The returned list is 1 indexed. Index 0 contains
642 	 * {@link java.lang.Integer#MIN_VALUE} to pad the list out.
643 	 * <p>
644 	 * Using a 1 indexed list means that line numbers can be directly accessed
645 	 * from the list, so <code>list.get(1)</code> (aka get line 1) returns
646 	 * <code>ptr</code>.
647 	 * <p>
648 	 * The last element (index <code>map.size()-1</code>) always contains
649 	 * <code>end</code>.
650 	 *
651 	 * @param buf
652 	 *            buffer to scan.
653 	 * @param ptr
654 	 *            position within the buffer corresponding to the first byte of
655 	 *            line 1.
656 	 * @param end
657 	 *            1 past the end of the content within <code>buf</code>.
658 	 * @return a line map indicating the starting position of each line.
659 	 */
660 	public static final IntList lineMap(byte[] buf, int ptr, int end) {
661 		IntList map = new IntList((end - ptr) / 36);
662 		map.fillTo(1, Integer.MIN_VALUE);
663 		for (; ptr < end; ptr = nextLF(buf, ptr)) {
664 			map.add(ptr);
665 		}
666 		map.add(end);
667 		return map;
668 	}
669 
670 	/**
671 	 * Like {@link #lineMap(byte[], int, int)} but throw
672 	 * {@link BinaryBlobException} if a NUL byte is encountered.
673 	 *
674 	 * @param buf
675 	 *            buffer to scan.
676 	 * @param ptr
677 	 *            position within the buffer corresponding to the first byte of
678 	 *            line 1.
679 	 * @param end
680 	 *            1 past the end of the content within <code>buf</code>.
681 	 * @return a line map indicating the starting position of each line.
682 	 * @throws BinaryBlobException
683 	 *             if a NUL byte or a lone CR is found.
684 	 * @since 5.0
685 	 */
686 	public static final IntList lineMapOrBinary(byte[] buf, int ptr, int end)
687 			throws BinaryBlobException {
688 		// Experimentally derived from multiple source repositories
689 		// the average number of bytes/line is 36. Its a rough guess
690 		// to initially size our map close to the target.
691 		IntList map = new IntList((end - ptr) / 36);
692 		map.add(Integer.MIN_VALUE);
693 		byte last = '\n'; // Must be \n to add the initial ptr
694 		for (; ptr < end; ptr++) {
695 			if (last == '\n') {
696 				map.add(ptr);
697 			}
698 			byte curr = buf[ptr];
699 			if (RawText.isBinary(curr, last)) {
700 				throw new BinaryBlobException();
701 			}
702 			last = curr;
703 		}
704 		if (last == '\r') {
705 			// Counts as binary
706 			throw new BinaryBlobException();
707 		}
708 		map.add(end);
709 		return map;
710 	}
711 
712 	/**
713 	 * Locate the "author " header line data.
714 	 *
715 	 * @param b
716 	 *            buffer to scan.
717 	 * @param ptr
718 	 *            position in buffer to start the scan at. Most callers should
719 	 *            pass 0 to ensure the scan starts from the beginning of the
720 	 *            commit buffer and does not accidentally look at message body.
721 	 * @return position just after the space in "author ", so the first
722 	 *         character of the author's name. If no author header can be
723 	 *         located -1 is returned.
724 	 */
725 	public static final int author(byte[] b, int ptr) {
726 		final int sz = b.length;
727 		if (ptr == 0)
728 			ptr += 46; // skip the "tree ..." line.
729 		while (ptr < sz && b[ptr] == 'p')
730 			ptr += 48; // skip this parent.
731 		return match(b, ptr, author);
732 	}
733 
734 	/**
735 	 * Locate the "committer " header line data.
736 	 *
737 	 * @param b
738 	 *            buffer to scan.
739 	 * @param ptr
740 	 *            position in buffer to start the scan at. Most callers should
741 	 *            pass 0 to ensure the scan starts from the beginning of the
742 	 *            commit buffer and does not accidentally look at message body.
743 	 * @return position just after the space in "committer ", so the first
744 	 *         character of the committer's name. If no committer header can be
745 	 *         located -1 is returned.
746 	 */
747 	public static final int committer(byte[] b, int ptr) {
748 		final int sz = b.length;
749 		if (ptr == 0)
750 			ptr += 46; // skip the "tree ..." line.
751 		while (ptr < sz && b[ptr] == 'p')
752 			ptr += 48; // skip this parent.
753 		if (ptr < sz && b[ptr] == 'a')
754 			ptr = nextLF(b, ptr);
755 		return match(b, ptr, committer);
756 	}
757 
758 	/**
759 	 * Locate the "tagger " header line data.
760 	 *
761 	 * @param b
762 	 *            buffer to scan.
763 	 * @param ptr
764 	 *            position in buffer to start the scan at. Most callers should
765 	 *            pass 0 to ensure the scan starts from the beginning of the tag
766 	 *            buffer and does not accidentally look at message body.
767 	 * @return position just after the space in "tagger ", so the first
768 	 *         character of the tagger's name. If no tagger header can be
769 	 *         located -1 is returned.
770 	 */
771 	public static final int tagger(byte[] b, int ptr) {
772 		final int sz = b.length;
773 		if (ptr == 0)
774 			ptr += 48; // skip the "object ..." line.
775 		while (ptr < sz) {
776 			if (b[ptr] == '\n')
777 				return -1;
778 			final int m = match(b, ptr, tagger);
779 			if (m >= 0)
780 				return m;
781 			ptr = nextLF(b, ptr);
782 		}
783 		return -1;
784 	}
785 
786 	/**
787 	 * Locate the "encoding " header line.
788 	 *
789 	 * @param b
790 	 *            buffer to scan.
791 	 * @param ptr
792 	 *            position in buffer to start the scan at. Most callers should
793 	 *            pass 0 to ensure the scan starts from the beginning of the
794 	 *            buffer and does not accidentally look at the message body.
795 	 * @return position just after the space in "encoding ", so the first
796 	 *         character of the encoding's name. If no encoding header can be
797 	 *         located -1 is returned (and UTF-8 should be assumed).
798 	 */
799 	public static final int encoding(byte[] b, int ptr) {
800 		final int sz = b.length;
801 		while (ptr < sz) {
802 			if (b[ptr] == '\n')
803 				return -1;
804 			if (b[ptr] == 'e')
805 				break;
806 			ptr = nextLF(b, ptr);
807 		}
808 		return match(b, ptr, encoding);
809 	}
810 
811 	/**
812 	 * Parse the "encoding " header as a string.
813 	 * <p>
814 	 * Locates the "encoding " header (if present) and returns its value.
815 	 *
816 	 * @param b
817 	 *            buffer to scan.
818 	 * @return the encoding header as specified in the commit; null if the
819 	 *         header was not present and should be assumed.
820 	 * @since 4.2
821 	 */
822 	@Nullable
823 	public static String parseEncodingName(byte[] b) {
824 		int enc = encoding(b, 0);
825 		if (enc < 0) {
826 			return null;
827 		}
828 		int lf = nextLF(b, enc);
829 		return decode(UTF_8, b, enc, lf - 1);
830 	}
831 
832 	/**
833 	 * Parse the "encoding " header into a character set reference.
834 	 * <p>
835 	 * Locates the "encoding " header (if present) by first calling
836 	 * {@link #encoding(byte[], int)} and then returns the proper character set
837 	 * to apply to this buffer to evaluate its contents as character data.
838 	 * <p>
839 	 * If no encoding header is present {@code UTF-8} is assumed.
840 	 *
841 	 * @param b
842 	 *            buffer to scan.
843 	 * @return the Java character set representation. Never null.
844 	 * @throws IllegalCharsetNameException
845 	 *             if the character set requested by the encoding header is
846 	 *             malformed and unsupportable.
847 	 * @throws UnsupportedCharsetException
848 	 *             if the JRE does not support the character set requested by
849 	 *             the encoding header.
850 	 */
851 	public static Charset parseEncoding(byte[] b) {
852 		String enc = parseEncodingName(b);
853 		if (enc == null) {
854 			return UTF_8;
855 		}
856 
857 		String name = enc.trim();
858 		try {
859 			return Charset.forName(name);
860 		} catch (IllegalCharsetNameException
861 				| UnsupportedCharsetException badName) {
862 			Charset aliased = charsetForAlias(name);
863 			if (aliased != null) {
864 				return aliased;
865 			}
866 			throw badName;
867 		}
868 	}
869 
870 	/**
871 	 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
872 	 * <p>
873 	 * Leading spaces won't be trimmed from the string, i.e. will show up in the
874 	 * parsed name afterwards.
875 	 *
876 	 * @param in
877 	 *            the string to parse a name from.
878 	 * @return the parsed identity or null in case the identity could not be
879 	 *         parsed.
880 	 */
881 	public static PersonIdent parsePersonIdent(String in) {
882 		return parsePersonIdent(Constants.encode(in), 0);
883 	}
884 
	/**
	 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
	 * <p>
	 * When passing in a value for <code>nameB</code> callers should use the
	 * return value of {@link #author(byte[], int)} or
	 * {@link #committer(byte[], int)}, as these methods provide the proper
	 * position within the buffer.
	 * <p>
	 * The character set used for the name and email is taken from the
	 * buffer's encoding header via {@link #parseEncoding(byte[])}; UTF-8 is
	 * used when that header names an illegal or unsupported character set.
	 * <p>
	 * An identity without a trailing time and timezone is still accepted; in
	 * that case both values are passed to the PersonIdent as 0.
	 *
	 * @param raw
	 *            the buffer to parse character data from.
	 * @param nameB
	 *            first position of the identity information. This should be the
	 *            first position after the space which delimits the header field
	 *            name (e.g. "author" or "committer") from the rest of the
	 *            identity line.
	 * @return the parsed identity or null in case the identity could not be
	 *         parsed.
	 */
	public static PersonIdent parsePersonIdent(byte[] raw, int nameB) {
		Charset cs;
		try {
			cs = parseEncoding(raw);
		} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
			// Assume UTF-8 for person identities, usually this is correct.
			// If not decode() will fall back to the ISO-8859-1 encoding.
			cs = UTF_8;
		}

		// emailB: position just after the first '<' or LF following nameB.
		// emailE: position just after the first '>' or LF following emailB.
		final int emailB = nextLF(raw, nameB, '<');
		final int emailE = nextLF(raw, emailB, '>');
		// Give up if nothing follows the '<', if the email would start with
		// a LF, or if the scan for '>' reached the end of the buffer without
		// the last byte scanned being a '>'.
		if (emailB >= raw.length || raw[emailB] == '\n' ||
				(emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
			return null;

		// The name ends in front of the '<'; a single space directly before
		// the '<' is treated as a separator and excluded from the name.
		final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
				emailB - 2 : emailB - 1;
		final String name = decode(cs, raw, nameB, nameEnd);
		final String email = decode(cs, raw, emailB, emailE - 1);

		// Start searching from end of line, as after first name-email pair,
		// another name-email pair may occur. We will ignore all kinds of
		// "junk" following the first email.
		//
		// We've to use (emailE - 1) for the case that raw[emailE - 1] is LF,
		// otherwise we would run too far. "-2" is necessary to position
		// before the LF in case of LF termination resp. the penultimate
		// character if there is no trailing LF.
		final int tzBegin = lastIndexOfTrim(raw, ' ',
				nextLF(raw, emailE - 1) - 2) + 1;
		if (tzBegin <= emailE) // No time/zone, still valid
			return new PersonIdent(name, email, 0, 0);

		// The time field is searched for in front of the timezone, but never
		// earlier than the end of the email.
		final int whenBegin = Math.max(emailE,
				lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
		if (whenBegin >= tzBegin - 1) // No time/zone, still valid
			return new PersonIdent(name, email, 0, 0);

		final long when = parseLongBase10(raw, whenBegin, null);
		final int tz = parseTimeZoneOffset(raw, tzBegin);
		// NOTE(review): the factor 1000 presumably converts seconds to
		// milliseconds for PersonIdent - confirm against its constructor.
		return new PersonIdent(name, email, when * 1000L, tz);
	}
946 
947 	/**
948 	 * Parse a name data (e.g. as within a reflog) into a PersonIdent.
949 	 * <p>
950 	 * When passing in a value for <code>nameB</code> callers should use the
951 	 * return value of {@link #author(byte[], int)} or
952 	 * {@link #committer(byte[], int)}, as these methods provide the proper
953 	 * position within the buffer.
954 	 *
955 	 * @param raw
956 	 *            the buffer to parse character data from.
957 	 * @param nameB
958 	 *            first position of the identity information. This should be the
959 	 *            first position after the space which delimits the header field
960 	 *            name (e.g. "author" or "committer") from the rest of the
961 	 *            identity line.
962 	 * @return the parsed identity. Never null.
963 	 */
964 	public static PersonIdent parsePersonIdentOnly(final byte[] raw,
965 			final int nameB) {
966 		int stop = nextLF(raw, nameB);
967 		int emailB = nextLF(raw, nameB, '<');
968 		int emailE = nextLF(raw, emailB, '>');
969 		final String name;
970 		final String email;
971 		if (emailE < stop) {
972 			email = decode(raw, emailB, emailE - 1);
973 		} else {
974 			email = "invalid"; //$NON-NLS-1$
975 		}
976 		if (emailB < stop)
977 			name = decode(raw, nameB, emailB - 2);
978 		else
979 			name = decode(raw, nameB, stop);
980 
981 		final MutableInteger ptrout = new MutableInteger();
982 		long when;
983 		int tz;
984 		if (emailE < stop) {
985 			when = parseLongBase10(raw, emailE + 1, ptrout);
986 			tz = parseTimeZoneOffset(raw, ptrout.value);
987 		} else {
988 			when = 0;
989 			tz = 0;
990 		}
991 		return new PersonIdent(name, email, when * 1000L, tz);
992 	}
993 
994 	/**
995 	 * Locate the end of a footer line key string.
996 	 * <p>
997 	 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
998 	 * "Signed-off-by: A. U. Thor\n") then this method returns the position of
999 	 * the first ':'.
1000 	 * <p>
1001 	 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
1002 	 * then this method returns -1.
1003 	 *
1004 	 * @param raw
1005 	 *            buffer to scan.
1006 	 * @param ptr
1007 	 *            first position within raw to consider as a footer line key.
1008 	 * @return position of the ':' which terminates the footer line key if this
1009 	 *         is otherwise a valid footer line key; otherwise -1.
1010 	 */
1011 	public static int endOfFooterLineKey(byte[] raw, int ptr) {
1012 		try {
1013 			for (;;) {
1014 				final byte c = raw[ptr];
1015 				if (footerLineKeyChars[c] == 0) {
1016 					if (c == ':')
1017 						return ptr;
1018 					return -1;
1019 				}
1020 				ptr++;
1021 			}
1022 		} catch (ArrayIndexOutOfBoundsException e) {
1023 			return -1;
1024 		}
1025 	}
1026 
1027 	/**
1028 	 * Decode a buffer under UTF-8, if possible.
1029 	 *
1030 	 * If the byte stream cannot be decoded that way, the platform default is tried
1031 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1032 	 *
1033 	 * @param buffer
1034 	 *            buffer to pull raw bytes from.
1035 	 * @return a string representation of the range <code>[start,end)</code>,
1036 	 *         after decoding the region through the specified character set.
1037 	 */
1038 	public static String decode(byte[] buffer) {
1039 		return decode(buffer, 0, buffer.length);
1040 	}
1041 
1042 	/**
1043 	 * Decode a buffer under UTF-8, if possible.
1044 	 *
1045 	 * If the byte stream cannot be decoded that way, the platform default is
1046 	 * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1047 	 *
1048 	 * @param buffer
1049 	 *            buffer to pull raw bytes from.
1050 	 * @param start
1051 	 *            start position in buffer
1052 	 * @param end
1053 	 *            one position past the last location within the buffer to take
1054 	 *            data from.
1055 	 * @return a string representation of the range <code>[start,end)</code>,
1056 	 *         after decoding the region through the specified character set.
1057 	 */
1058 	public static String decode(final byte[] buffer, final int start,
1059 			final int end) {
1060 		return decode(UTF_8, buffer, start, end);
1061 	}
1062 
1063 	/**
1064 	 * Decode a buffer under the specified character set if possible.
1065 	 *
1066 	 * If the byte stream cannot be decoded that way, the platform default is tried
1067 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1068 	 *
1069 	 * @param cs
1070 	 *            character set to use when decoding the buffer.
1071 	 * @param buffer
1072 	 *            buffer to pull raw bytes from.
1073 	 * @return a string representation of the range <code>[start,end)</code>,
1074 	 *         after decoding the region through the specified character set.
1075 	 */
1076 	public static String decode(Charset cs, byte[] buffer) {
1077 		return decode(cs, buffer, 0, buffer.length);
1078 	}
1079 
1080 	/**
1081 	 * Decode a region of the buffer under the specified character set if possible.
1082 	 *
1083 	 * If the byte stream cannot be decoded that way, the platform default is tried
1084 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1085 	 *
1086 	 * @param cs
1087 	 *            character set to use when decoding the buffer.
1088 	 * @param buffer
1089 	 *            buffer to pull raw bytes from.
1090 	 * @param start
1091 	 *            first position within the buffer to take data from.
1092 	 * @param end
1093 	 *            one position past the last location within the buffer to take
1094 	 *            data from.
1095 	 * @return a string representation of the range <code>[start,end)</code>,
1096 	 *         after decoding the region through the specified character set.
1097 	 */
1098 	public static String decode(final Charset cs, final byte[] buffer,
1099 			final int start, final int end) {
1100 		try {
1101 			return decodeNoFallback(cs, buffer, start, end);
1102 		} catch (CharacterCodingException e) {
1103 			// Fall back to an ISO-8859-1 style encoding. At least all of
1104 			// the bytes will be present in the output.
1105 			//
1106 			return extractBinaryString(buffer, start, end);
1107 		}
1108 	}
1109 
1110 	/**
1111 	 * Decode a region of the buffer under the specified character set if
1112 	 * possible.
1113 	 *
1114 	 * If the byte stream cannot be decoded that way, the platform default is
1115 	 * tried and if that too fails, an exception is thrown.
1116 	 *
1117 	 * @param cs
1118 	 *            character set to use when decoding the buffer.
1119 	 * @param buffer
1120 	 *            buffer to pull raw bytes from.
1121 	 * @param start
1122 	 *            first position within the buffer to take data from.
1123 	 * @param end
1124 	 *            one position past the last location within the buffer to take
1125 	 *            data from.
1126 	 * @return a string representation of the range <code>[start,end)</code>,
1127 	 *         after decoding the region through the specified character set.
1128 	 * @throws java.nio.charset.CharacterCodingException
1129 	 *             the input is not in any of the tested character sets.
1130 	 */
1131 	public static String decodeNoFallback(final Charset cs,
1132 			final byte[] buffer, final int start, final int end)
1133 			throws CharacterCodingException {
1134 		ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
1135 		b.mark();
1136 
1137 		// Try our built-in favorite. The assumption here is that
1138 		// decoding will fail if the data is not actually encoded
1139 		// using that encoder.
1140 		try {
1141 			return decode(b, UTF_8);
1142 		} catch (CharacterCodingException e) {
1143 			b.reset();
1144 		}
1145 
1146 		if (!cs.equals(UTF_8)) {
1147 			// Try the suggested encoding, it might be right since it was
1148 			// provided by the caller.
1149 			try {
1150 				return decode(b, cs);
1151 			} catch (CharacterCodingException e) {
1152 				b.reset();
1153 			}
1154 		}
1155 
1156 		// Try the default character set. A small group of people
1157 		// might actually use the same (or very similar) locale.
1158 		Charset defcs = SystemReader.getInstance().getDefaultCharset();
1159 		if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
1160 			try {
1161 				return decode(b, defcs);
1162 			} catch (CharacterCodingException e) {
1163 				b.reset();
1164 			}
1165 		}
1166 
1167 		throw new CharacterCodingException();
1168 	}
1169 
1170 	/**
1171 	 * Decode a region of the buffer under the ISO-8859-1 encoding.
1172 	 *
1173 	 * Each byte is treated as a single character in the 8859-1 character
1174 	 * encoding, performing a raw binary-&gt;char conversion.
1175 	 *
1176 	 * @param buffer
1177 	 *            buffer to pull raw bytes from.
1178 	 * @param start
1179 	 *            first position within the buffer to take data from.
1180 	 * @param end
1181 	 *            one position past the last location within the buffer to take
1182 	 *            data from.
1183 	 * @return a string representation of the range <code>[start,end)</code>.
1184 	 */
1185 	public static String extractBinaryString(final byte[] buffer,
1186 			final int start, final int end) {
1187 		final StringBuilder r = new StringBuilder(end - start);
1188 		for (int i = start; i < end; i++)
1189 			r.append((char) (buffer[i] & 0xff));
1190 		return r.toString();
1191 	}
1192 
1193 	private static String decode(ByteBuffer b, Charset charset)
1194 			throws CharacterCodingException {
1195 		final CharsetDecoder d = charset.newDecoder();
1196 		d.onMalformedInput(CodingErrorAction.REPORT);
1197 		d.onUnmappableCharacter(CodingErrorAction.REPORT);
1198 		return d.decode(b).toString();
1199 	}
1200 
1201 	/**
1202 	 * Locate the position of the commit message body.
1203 	 *
1204 	 * @param b
1205 	 *            buffer to scan.
1206 	 * @param ptr
1207 	 *            position in buffer to start the scan at. Most callers should
1208 	 *            pass 0 to ensure the scan starts from the beginning of the
1209 	 *            commit buffer.
1210 	 * @return position of the user's message buffer.
1211 	 */
1212 	public static final int commitMessage(byte[] b, int ptr) {
1213 		final int sz = b.length;
1214 		if (ptr == 0)
1215 			ptr += 46; // skip the "tree ..." line.
1216 		while (ptr < sz && b[ptr] == 'p')
1217 			ptr += 48; // skip this parent.
1218 
1219 		// Skip any remaining header lines, ignoring what their actual
1220 		// header line type is. This is identical to the logic for a tag.
1221 		//
1222 		return tagMessage(b, ptr);
1223 	}
1224 
1225 	/**
1226 	 * Locate the position of the tag message body.
1227 	 *
1228 	 * @param b
1229 	 *            buffer to scan.
1230 	 * @param ptr
1231 	 *            position in buffer to start the scan at. Most callers should
1232 	 *            pass 0 to ensure the scan starts from the beginning of the tag
1233 	 *            buffer.
1234 	 * @return position of the user's message buffer.
1235 	 */
1236 	public static final int tagMessage(byte[] b, int ptr) {
1237 		final int sz = b.length;
1238 		if (ptr == 0)
1239 			ptr += 48; // skip the "object ..." line.
1240 		while (ptr < sz && b[ptr] != '\n')
1241 			ptr = nextLF(b, ptr);
1242 		if (ptr < sz && b[ptr] == '\n')
1243 			return ptr + 1;
1244 		return -1;
1245 	}
1246 
1247 	/**
1248 	 * Locate the end of a paragraph.
1249 	 * <p>
1250 	 * A paragraph is ended by two consecutive LF bytes or CRLF pairs
1251 	 *
1252 	 * @param b
1253 	 *            buffer to scan.
1254 	 * @param start
1255 	 *            position in buffer to start the scan at. Most callers will
1256 	 *            want to pass the first position of the commit message (as
1257 	 *            found by {@link #commitMessage(byte[], int)}.
1258 	 * @return position of the LF at the end of the paragraph;
1259 	 *         <code>b.length</code> if no paragraph end could be located.
1260 	 */
1261 	public static final int endOfParagraph(byte[] b, int start) {
1262 		int ptr = start;
1263 		final int sz = b.length;
1264 		while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
1265 			ptr = nextLF(b, ptr);
1266 		if (ptr > start && b[ptr - 1] == '\n')
1267 			ptr--;
1268 		if (ptr > start && b[ptr - 1] == '\r')
1269 			ptr--;
1270 		return ptr;
1271 	}
1272 
1273 	/**
1274 	 * Get last index of {@code ch} in raw, trimming spaces.
1275 	 *
1276 	 * @param raw
1277 	 *            buffer to scan.
1278 	 * @param ch
1279 	 *            character to find.
1280 	 * @param pos
1281 	 *            starting position.
1282 	 * @return last index of {@code ch} in raw, trimming spaces.
1283 	 * @since 4.1
1284 	 */
1285 	public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
1286 		while (pos >= 0 && raw[pos] == ' ')
1287 			pos--;
1288 
1289 		while (pos >= 0 && raw[pos] != ch)
1290 			pos--;
1291 
1292 		return pos;
1293 	}
1294 
1295 	private static Charset charsetForAlias(String name) {
1296 		return encodingAliases.get(StringUtils.toLowerCase(name));
1297 	}
1298 
	/**
	 * Private constructor: this class only offers static helpers and is not
	 * meant to be instantiated.
	 */
	private RawParseUtils() {
		// Don't create instances of a static only utility.
	}
1302 }