View Javadoc
1   /*
2    * Copyright (C) 2008-2009, Google Inc.
3    * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org> and others
4    *
5    * This program and the accompanying materials are made available under the
6    * terms of the Eclipse Distribution License v. 1.0 which is available at
7    * https://www.eclipse.org/org/documents/edl-v10.php.
8    *
9    * SPDX-License-Identifier: BSD-3-Clause
10   */
11  
12  package org.eclipse.jgit.util;
13  
14  import static java.nio.charset.StandardCharsets.ISO_8859_1;
15  import static java.nio.charset.StandardCharsets.UTF_8;
16  import static org.eclipse.jgit.lib.ObjectChecker.author;
17  import static org.eclipse.jgit.lib.ObjectChecker.committer;
18  import static org.eclipse.jgit.lib.ObjectChecker.encoding;
19  import static org.eclipse.jgit.lib.ObjectChecker.tagger;
20  
21  import java.nio.ByteBuffer;
22  import java.nio.charset.CharacterCodingException;
23  import java.nio.charset.Charset;
24  import java.nio.charset.CharsetDecoder;
25  import java.nio.charset.CodingErrorAction;
26  import java.nio.charset.IllegalCharsetNameException;
27  import java.nio.charset.UnsupportedCharsetException;
28  import java.util.Arrays;
29  import java.util.HashMap;
30  import java.util.Map;
31  
32  import org.eclipse.jgit.annotations.Nullable;
33  import org.eclipse.jgit.errors.BinaryBlobException;
34  import org.eclipse.jgit.lib.Constants;
35  import org.eclipse.jgit.lib.PersonIdent;
36  
37  /**
38   * Handy utility functions to parse raw object contents.
39   */
40  public final class RawParseUtils {
41  	/**
42  	 * UTF-8 charset constant.
43  	 *
44  	 * @since 2.2
45  	 * @deprecated use {@link java.nio.charset.StandardCharsets#UTF_8} instead
46  	 */
47  	@Deprecated
48  	public static final Charset UTF8_CHARSET = UTF_8;
49  
50  	private static final byte[] digits10;
51  
52  	private static final byte[] digits16;
53  
54  	private static final byte[] footerLineKeyChars;
55  
56  	private static final Map<String, Charset> encodingAliases;
57  
58  	static {
59  		encodingAliases = new HashMap<>();
60  		encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
61  		encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$
62  
63  		digits10 = new byte['9' + 1];
64  		Arrays.fill(digits10, (byte) -1);
65  		for (char i = '0'; i <= '9'; i++)
66  			digits10[i] = (byte) (i - '0');
67  
68  		digits16 = new byte['f' + 1];
69  		Arrays.fill(digits16, (byte) -1);
70  		for (char i = '0'; i <= '9'; i++)
71  			digits16[i] = (byte) (i - '0');
72  		for (char i = 'a'; i <= 'f'; i++)
73  			digits16[i] = (byte) ((i - 'a') + 10);
74  		for (char i = 'A'; i <= 'F'; i++)
75  			digits16[i] = (byte) ((i - 'A') + 10);
76  
77  		footerLineKeyChars = new byte['z' + 1];
78  		footerLineKeyChars['-'] = 1;
79  		for (char i = '0'; i <= '9'; i++)
80  			footerLineKeyChars[i] = 1;
81  		for (char i = 'A'; i <= 'Z'; i++)
82  			footerLineKeyChars[i] = 1;
83  		for (char i = 'a'; i <= 'z'; i++)
84  			footerLineKeyChars[i] = 1;
85  	}
86  
87  	/**
88  	 * Determine if b[ptr] matches src.
89  	 *
90  	 * @param b
91  	 *            the buffer to scan.
92  	 * @param ptr
93  	 *            first position within b, this should match src[0].
94  	 * @param src
95  	 *            the buffer to test for equality with b.
96  	 * @return ptr + src.length if b[ptr..src.length] == src; else -1.
97  	 */
98  	public static final int match(byte[] b, int ptr, byte[] src) {
99  		if (ptr + src.length > b.length)
100 			return -1;
101 		for (int i = 0; i < src.length; i++, ptr++)
102 			if (b[ptr] != src[i])
103 				return -1;
104 		return ptr;
105 	}
106 
107 	private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
108 			'6', '7', '8', '9' };
109 
110 	/**
111 	 * Format a base 10 numeric into a temporary buffer.
112 	 * <p>
113 	 * Formatting is performed backwards. The method starts at offset
114 	 * <code>o-1</code> and ends at <code>o-1-digits</code>, where
115 	 * <code>digits</code> is the number of positions necessary to store the
116 	 * base 10 value.
117 	 * <p>
118 	 * The argument and return values from this method make it easy to chain
119 	 * writing, for example:
120 	 * </p>
121 	 *
122 	 * <pre>
123 	 * final byte[] tmp = new byte[64];
124 	 * int ptr = tmp.length;
125 	 * tmp[--ptr] = '\n';
126 	 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
127 	 * tmp[--ptr] = ' ';
128 	 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
129 	 * tmp[--ptr] = 0;
130 	 * final String str = new String(tmp, ptr, tmp.length - ptr);
131 	 * </pre>
132 	 *
133 	 * @param b
134 	 *            buffer to write into.
135 	 * @param o
136 	 *            one offset past the location where writing will begin; writing
137 	 *            proceeds towards lower index values.
138 	 * @param value
139 	 *            the value to store.
140 	 * @return the new offset value <code>o</code>. This is the position of
141 	 *         the last byte written. Additional writing should start at one
142 	 *         position earlier.
143 	 */
144 	public static int formatBase10(final byte[] b, int o, int value) {
145 		if (value == 0) {
146 			b[--o] = '0';
147 			return o;
148 		}
149 		final boolean isneg = value < 0;
150 		if (isneg)
151 			value = -value;
152 		while (value != 0) {
153 			b[--o] = base10byte[value % 10];
154 			value /= 10;
155 		}
156 		if (isneg)
157 			b[--o] = '-';
158 		return o;
159 	}
160 
161 	/**
162 	 * Parse a base 10 numeric from a sequence of ASCII digits into an int.
163 	 * <p>
164 	 * Digit sequences can begin with an optional run of spaces before the
165 	 * sequence, and may start with a '+' or a '-' to indicate sign position.
166 	 * Any other characters will cause the method to stop and return the current
167 	 * result to the caller.
168 	 *
169 	 * @param b
170 	 *            buffer to scan.
171 	 * @param ptr
172 	 *            position within buffer to start parsing digits at.
173 	 * @param ptrResult
174 	 *            optional location to return the new ptr value through. If null
175 	 *            the ptr value will be discarded.
176 	 * @return the value at this location; 0 if the location is not a valid
177 	 *         numeric.
178 	 */
179 	public static final int parseBase10(final byte[] b, int ptr,
180 			final MutableInteger ptrResult) {
181 		int r = 0;
182 		int sign = 0;
183 		try {
184 			final int sz = b.length;
185 			while (ptr < sz && b[ptr] == ' ')
186 				ptr++;
187 			if (ptr >= sz)
188 				return 0;
189 
190 			switch (b[ptr]) {
191 			case '-':
192 				sign = -1;
193 				ptr++;
194 				break;
195 			case '+':
196 				ptr++;
197 				break;
198 			}
199 
200 			while (ptr < sz) {
201 				final byte v = digits10[b[ptr]];
202 				if (v < 0)
203 					break;
204 				r = (r * 10) + v;
205 				ptr++;
206 			}
207 		} catch (ArrayIndexOutOfBoundsException e) {
208 			// Not a valid digit.
209 		}
210 		if (ptrResult != null)
211 			ptrResult.value = ptr;
212 		return sign < 0 ? -r : r;
213 	}
214 
215 	/**
216 	 * Parse a base 10 numeric from a sequence of ASCII digits into a long.
217 	 * <p>
218 	 * Digit sequences can begin with an optional run of spaces before the
219 	 * sequence, and may start with a '+' or a '-' to indicate sign position.
220 	 * Any other characters will cause the method to stop and return the current
221 	 * result to the caller.
222 	 *
223 	 * @param b
224 	 *            buffer to scan.
225 	 * @param ptr
226 	 *            position within buffer to start parsing digits at.
227 	 * @param ptrResult
228 	 *            optional location to return the new ptr value through. If null
229 	 *            the ptr value will be discarded.
230 	 * @return the value at this location; 0 if the location is not a valid
231 	 *         numeric.
232 	 */
233 	public static final long parseLongBase10(final byte[] b, int ptr,
234 			final MutableInteger ptrResult) {
235 		long r = 0;
236 		int sign = 0;
237 		try {
238 			final int sz = b.length;
239 			while (ptr < sz && b[ptr] == ' ')
240 				ptr++;
241 			if (ptr >= sz)
242 				return 0;
243 
244 			switch (b[ptr]) {
245 			case '-':
246 				sign = -1;
247 				ptr++;
248 				break;
249 			case '+':
250 				ptr++;
251 				break;
252 			}
253 
254 			while (ptr < sz) {
255 				final byte v = digits10[b[ptr]];
256 				if (v < 0)
257 					break;
258 				r = (r * 10) + v;
259 				ptr++;
260 			}
261 		} catch (ArrayIndexOutOfBoundsException e) {
262 			// Not a valid digit.
263 		}
264 		if (ptrResult != null)
265 			ptrResult.value = ptr;
266 		return sign < 0 ? -r : r;
267 	}
268 
269 	/**
270 	 * Parse 4 character base 16 (hex) formatted string to unsigned integer.
271 	 * <p>
272 	 * The number is read in network byte order, that is, most significant
273 	 * nybble first.
274 	 *
275 	 * @param bs
276 	 *            buffer to parse digits from; positions {@code [p, p+4)} will
277 	 *            be parsed.
278 	 * @param p
279 	 *            first position within the buffer to parse.
280 	 * @return the integer value.
281 	 * @throws java.lang.ArrayIndexOutOfBoundsException
282 	 *             if the string is not hex formatted.
283 	 */
284 	public static final int parseHexInt16(final byte[] bs, final int p) {
285 		int r = digits16[bs[p]] << 4;
286 
287 		r |= digits16[bs[p + 1]];
288 		r <<= 4;
289 
290 		r |= digits16[bs[p + 2]];
291 		r <<= 4;
292 
293 		r |= digits16[bs[p + 3]];
294 		if (r < 0)
295 			throw new ArrayIndexOutOfBoundsException();
296 		return r;
297 	}
298 
299 	/**
300 	 * Parse 8 character base 16 (hex) formatted string to unsigned integer.
301 	 * <p>
302 	 * The number is read in network byte order, that is, most significant
303 	 * nybble first.
304 	 *
305 	 * @param bs
306 	 *            buffer to parse digits from; positions {@code [p, p+8)} will
307 	 *            be parsed.
308 	 * @param p
309 	 *            first position within the buffer to parse.
310 	 * @return the integer value.
311 	 * @throws java.lang.ArrayIndexOutOfBoundsException
312 	 *             if the string is not hex formatted.
313 	 */
314 	public static final int parseHexInt32(final byte[] bs, final int p) {
315 		int r = digits16[bs[p]] << 4;
316 
317 		r |= digits16[bs[p + 1]];
318 		r <<= 4;
319 
320 		r |= digits16[bs[p + 2]];
321 		r <<= 4;
322 
323 		r |= digits16[bs[p + 3]];
324 		r <<= 4;
325 
326 		r |= digits16[bs[p + 4]];
327 		r <<= 4;
328 
329 		r |= digits16[bs[p + 5]];
330 		r <<= 4;
331 
332 		r |= digits16[bs[p + 6]];
333 
334 		final int last = digits16[bs[p + 7]];
335 		if (r < 0 || last < 0)
336 			throw new ArrayIndexOutOfBoundsException();
337 		return (r << 4) | last;
338 	}
339 
340 	/**
341 	 * Parse 16 character base 16 (hex) formatted string to unsigned long.
342 	 * <p>
343 	 * The number is read in network byte order, that is, most significant
344 	 * nibble first.
345 	 *
346 	 * @param bs
347 	 *            buffer to parse digits from; positions {@code [p, p+16)} will
348 	 *            be parsed.
349 	 * @param p
350 	 *            first position within the buffer to parse.
351 	 * @return the integer value.
352 	 * @throws java.lang.ArrayIndexOutOfBoundsException
353 	 *             if the string is not hex formatted.
354 	 * @since 4.3
355 	 */
356 	public static final long parseHexInt64(final byte[] bs, final int p) {
357 		long r = digits16[bs[p]] << 4;
358 
359 		r |= digits16[bs[p + 1]];
360 		r <<= 4;
361 
362 		r |= digits16[bs[p + 2]];
363 		r <<= 4;
364 
365 		r |= digits16[bs[p + 3]];
366 		r <<= 4;
367 
368 		r |= digits16[bs[p + 4]];
369 		r <<= 4;
370 
371 		r |= digits16[bs[p + 5]];
372 		r <<= 4;
373 
374 		r |= digits16[bs[p + 6]];
375 		r <<= 4;
376 
377 		r |= digits16[bs[p + 7]];
378 		r <<= 4;
379 
380 		r |= digits16[bs[p + 8]];
381 		r <<= 4;
382 
383 		r |= digits16[bs[p + 9]];
384 		r <<= 4;
385 
386 		r |= digits16[bs[p + 10]];
387 		r <<= 4;
388 
389 		r |= digits16[bs[p + 11]];
390 		r <<= 4;
391 
392 		r |= digits16[bs[p + 12]];
393 		r <<= 4;
394 
395 		r |= digits16[bs[p + 13]];
396 		r <<= 4;
397 
398 		r |= digits16[bs[p + 14]];
399 
400 		final int last = digits16[bs[p + 15]];
401 		if (r < 0 || last < 0)
402 			throw new ArrayIndexOutOfBoundsException();
403 		return (r << 4) | last;
404 	}
405 
406 	/**
407 	 * Parse a single hex digit to its numeric value (0-15).
408 	 *
409 	 * @param digit
410 	 *            hex character to parse.
411 	 * @return numeric value, in the range 0-15.
412 	 * @throws java.lang.ArrayIndexOutOfBoundsException
413 	 *             if the input digit is not a valid hex digit.
414 	 */
415 	public static final int parseHexInt4(final byte digit) {
416 		final byte r = digits16[digit];
417 		if (r < 0)
418 			throw new ArrayIndexOutOfBoundsException();
419 		return r;
420 	}
421 
422 	/**
423 	 * Parse a Git style timezone string.
424 	 * <p>
425 	 * The sequence "-0315" will be parsed as the numeric value -195, as the
426 	 * lower two positions count minutes, not 100ths of an hour.
427 	 *
428 	 * @param b
429 	 *            buffer to scan.
430 	 * @param ptr
431 	 *            position within buffer to start parsing digits at.
432 	 * @return the timezone at this location, expressed in minutes.
433 	 */
434 	public static final int parseTimeZoneOffset(byte[] b, int ptr) {
435 		return parseTimeZoneOffset(b, ptr, null);
436 	}
437 
438 	/**
439 	 * Parse a Git style timezone string.
440 	 * <p>
441 	 * The sequence "-0315" will be parsed as the numeric value -195, as the
442 	 * lower two positions count minutes, not 100ths of an hour.
443 	 *
444 	 * @param b
445 	 *            buffer to scan.
446 	 * @param ptr
447 	 *            position within buffer to start parsing digits at.
448 	 * @param ptrResult
449 	 *            optional location to return the new ptr value through. If null
450 	 *            the ptr value will be discarded.
451 	 * @return the timezone at this location, expressed in minutes.
452 	 * @since 4.1
453 	 */
454 	public static final int parseTimeZoneOffset(final byte[] b, int ptr,
455 			MutableInteger ptrResult) {
456 		final int v = parseBase10(b, ptr, ptrResult);
457 		final int tzMins = v % 100;
458 		final int tzHours = v / 100;
459 		return tzHours * 60 + tzMins;
460 	}
461 
462 	/**
463 	 * Locate the first position after a given character.
464 	 *
465 	 * @param b
466 	 *            buffer to scan.
467 	 * @param ptr
468 	 *            position within buffer to start looking for chrA at.
469 	 * @param chrA
470 	 *            character to find.
471 	 * @return new position just after chrA.
472 	 */
473 	public static final int next(byte[] b, int ptr, char chrA) {
474 		final int sz = b.length;
475 		while (ptr < sz) {
476 			if (b[ptr++] == chrA)
477 				return ptr;
478 		}
479 		return ptr;
480 	}
481 
482 	/**
483 	 * Locate the first position after the next LF.
484 	 * <p>
485 	 * This method stops on the first '\n' it finds.
486 	 *
487 	 * @param b
488 	 *            buffer to scan.
489 	 * @param ptr
490 	 *            position within buffer to start looking for LF at.
491 	 * @return new position just after the first LF found.
492 	 */
493 	public static final int nextLF(byte[] b, int ptr) {
494 		return next(b, ptr, '\n');
495 	}
496 
497 	/**
498 	 * Locate the first position after either the given character or LF.
499 	 * <p>
500 	 * This method stops on the first match it finds from either chrA or '\n'.
501 	 *
502 	 * @param b
503 	 *            buffer to scan.
504 	 * @param ptr
505 	 *            position within buffer to start looking for chrA or LF at.
506 	 * @param chrA
507 	 *            character to find.
508 	 * @return new position just after the first chrA or LF to be found.
509 	 */
510 	public static final int nextLF(byte[] b, int ptr, char chrA) {
511 		final int sz = b.length;
512 		while (ptr < sz) {
513 			final byte c = b[ptr++];
514 			if (c == chrA || c == '\n')
515 				return ptr;
516 		}
517 		return ptr;
518 	}
519 
520 	/**
521 	 * Locate the end of the header.  Note that headers may be
522 	 * more than one line long.
523 	 * @param b
524 	 *            buffer to scan.
525 	 * @param ptr
526 	 *            position within buffer to start looking for the end-of-header.
527 	 * @return new position just after the header.  This is either
528 	 * b.length, or the index of the header's terminating newline.
529 	 * @since 5.1
530 	 */
531 	public static final int headerEnd(final byte[] b, int ptr) {
532 		final int sz = b.length;
533 		while (ptr < sz) {
534 			final byte c = b[ptr++];
535 			if (c == '\n' && (ptr == sz || b[ptr] != ' ')) {
536 				return ptr - 1;
537 			}
538 		}
539 		return ptr - 1;
540 	}
541 
542 	/**
543 	 * Find the start of the contents of a given header.
544 	 *
545 	 * @param b
546 	 *            buffer to scan.
547 	 * @param headerName
548 	 *            header to search for
549 	 * @param ptr
550 	 *            position within buffer to start looking for header at.
551 	 * @return new position at the start of the header's contents, -1 for
552 	 *         not found
553 	 * @since 5.1
554 	 */
555 	public static final int headerStart(byte[] headerName, byte[] b, int ptr) {
556 		// Start by advancing to just past a LF or buffer start
557 		if (ptr != 0) {
558 			ptr = nextLF(b, ptr - 1);
559 		}
560 		while (ptr < b.length - (headerName.length + 1)) {
561 			boolean found = true;
562 			for (byte element : headerName) {
563 				if (element != b[ptr++]) {
564 					found = false;
565 					break;
566 				}
567 			}
568 			if (found && b[ptr++] == ' ') {
569 				return ptr;
570 			}
571 			ptr = nextLF(b, ptr);
572 		}
573 		return -1;
574 	}
575 
576 	/**
577 	 * Locate the first position before a given character.
578 	 *
579 	 * @param b
580 	 *            buffer to scan.
581 	 * @param ptr
582 	 *            position within buffer to start looking for chrA at.
583 	 * @param chrA
584 	 *            character to find.
585 	 * @return new position just before chrA, -1 for not found
586 	 */
587 	public static final int prev(byte[] b, int ptr, char chrA) {
588 		if (ptr == b.length)
589 			--ptr;
590 		while (ptr >= 0) {
591 			if (b[ptr--] == chrA)
592 				return ptr;
593 		}
594 		return ptr;
595 	}
596 
597 	/**
598 	 * Locate the first position before the previous LF.
599 	 * <p>
600 	 * This method stops on the first '\n' it finds.
601 	 *
602 	 * @param b
603 	 *            buffer to scan.
604 	 * @param ptr
605 	 *            position within buffer to start looking for LF at.
606 	 * @return new position just before the first LF found, -1 for not found
607 	 */
608 	public static final int prevLF(byte[] b, int ptr) {
609 		return prev(b, ptr, '\n');
610 	}
611 
612 	/**
613 	 * Locate the previous position before either the given character or LF.
614 	 * <p>
615 	 * This method stops on the first match it finds from either chrA or '\n'.
616 	 *
617 	 * @param b
618 	 *            buffer to scan.
619 	 * @param ptr
620 	 *            position within buffer to start looking for chrA or LF at.
621 	 * @param chrA
622 	 *            character to find.
623 	 * @return new position just before the first chrA or LF to be found, -1 for
624 	 *         not found
625 	 */
626 	public static final int prevLF(byte[] b, int ptr, char chrA) {
627 		if (ptr == b.length)
628 			--ptr;
629 		while (ptr >= 0) {
630 			final byte c = b[ptr--];
631 			if (c == chrA || c == '\n')
632 				return ptr;
633 		}
634 		return ptr;
635 	}
636 
637 	/**
638 	 * Index the region between <code>[ptr, end)</code> to find line starts.
639 	 * <p>
640 	 * The returned list is 1 indexed. Index 0 contains
641 	 * {@link java.lang.Integer#MIN_VALUE} to pad the list out.
642 	 * <p>
643 	 * Using a 1 indexed list means that line numbers can be directly accessed
644 	 * from the list, so <code>list.get(1)</code> (aka get line 1) returns
645 	 * <code>ptr</code>.
646 	 * <p>
647 	 * The last element (index <code>map.size()-1</code>) always contains
648 	 * <code>end</code>.
649 	 *
650 	 * @param buf
651 	 *            buffer to scan.
652 	 * @param ptr
653 	 *            position within the buffer corresponding to the first byte of
654 	 *            line 1.
655 	 * @param end
656 	 *            1 past the end of the content within <code>buf</code>.
657 	 * @return a line map indicating the starting position of each line.
658 	 */
659 	public static final IntList lineMap(byte[] buf, int ptr, int end) {
660 		IntList map = new IntList((end - ptr) / 36);
661 		map.fillTo(1, Integer.MIN_VALUE);
662 		for (; ptr < end; ptr = nextLF(buf, ptr)) {
663 			map.add(ptr);
664 		}
665 		map.add(end);
666 		return map;
667 	}
668 
669 	/**
670 	 * Like {@link #lineMap(byte[], int, int)} but throw
671 	 * {@link BinaryBlobException} if a NUL byte is encountered.
672 	 *
673 	 * @param buf
674 	 *            buffer to scan.
675 	 * @param ptr
676 	 *            position within the buffer corresponding to the first byte of
677 	 *            line 1.
678 	 * @param end
679 	 *            1 past the end of the content within <code>buf</code>.
680 	 * @return a line map indicating the starting position of each line.
681 	 * @throws BinaryBlobException
682 	 *            if a NUL byte is found.
683 	 * @since 5.0
684 	 */
685 	public static final IntList lineMapOrBinary(byte[] buf, int ptr, int end)
686 			throws BinaryBlobException {
687 		IntList map = lineMapOrNull(buf, ptr, end);
688 		if (map == null) {
689 			throw new BinaryBlobException();
690 		}
691 		return map;
692 	}
693 
694 	@Nullable
695 	private static IntList lineMapOrNull(byte[] buf, int ptr, int end) {
696 		// Experimentally derived from multiple source repositories
697 		// the average number of bytes/line is 36. Its a rough guess
698 		// to initially size our map close to the target.
699 		IntList map = new IntList((end - ptr) / 36);
700 		map.add(Integer.MIN_VALUE);
701 		boolean foundLF = true;
702 		for (; ptr < end; ptr++) {
703 			if (foundLF) {
704 				map.add(ptr);
705 			}
706 
707 			if (buf[ptr] == '\0') {
708 				return null;
709 			}
710 
711 			foundLF = (buf[ptr] == '\n');
712 		}
713 		map.add(end);
714 		return map;
715 	}
716 
717 	/**
718 	 * Locate the "author " header line data.
719 	 *
720 	 * @param b
721 	 *            buffer to scan.
722 	 * @param ptr
723 	 *            position in buffer to start the scan at. Most callers should
724 	 *            pass 0 to ensure the scan starts from the beginning of the
725 	 *            commit buffer and does not accidentally look at message body.
726 	 * @return position just after the space in "author ", so the first
727 	 *         character of the author's name. If no author header can be
728 	 *         located -1 is returned.
729 	 */
730 	public static final int author(byte[] b, int ptr) {
731 		final int sz = b.length;
732 		if (ptr == 0)
733 			ptr += 46; // skip the "tree ..." line.
734 		while (ptr < sz && b[ptr] == 'p')
735 			ptr += 48; // skip this parent.
736 		return match(b, ptr, author);
737 	}
738 
739 	/**
740 	 * Locate the "committer " header line data.
741 	 *
742 	 * @param b
743 	 *            buffer to scan.
744 	 * @param ptr
745 	 *            position in buffer to start the scan at. Most callers should
746 	 *            pass 0 to ensure the scan starts from the beginning of the
747 	 *            commit buffer and does not accidentally look at message body.
748 	 * @return position just after the space in "committer ", so the first
749 	 *         character of the committer's name. If no committer header can be
750 	 *         located -1 is returned.
751 	 */
752 	public static final int committer(byte[] b, int ptr) {
753 		final int sz = b.length;
754 		if (ptr == 0)
755 			ptr += 46; // skip the "tree ..." line.
756 		while (ptr < sz && b[ptr] == 'p')
757 			ptr += 48; // skip this parent.
758 		if (ptr < sz && b[ptr] == 'a')
759 			ptr = nextLF(b, ptr);
760 		return match(b, ptr, committer);
761 	}
762 
763 	/**
764 	 * Locate the "tagger " header line data.
765 	 *
766 	 * @param b
767 	 *            buffer to scan.
768 	 * @param ptr
769 	 *            position in buffer to start the scan at. Most callers should
770 	 *            pass 0 to ensure the scan starts from the beginning of the tag
771 	 *            buffer and does not accidentally look at message body.
772 	 * @return position just after the space in "tagger ", so the first
773 	 *         character of the tagger's name. If no tagger header can be
774 	 *         located -1 is returned.
775 	 */
776 	public static final int tagger(byte[] b, int ptr) {
777 		final int sz = b.length;
778 		if (ptr == 0)
779 			ptr += 48; // skip the "object ..." line.
780 		while (ptr < sz) {
781 			if (b[ptr] == '\n')
782 				return -1;
783 			final int m = match(b, ptr, tagger);
784 			if (m >= 0)
785 				return m;
786 			ptr = nextLF(b, ptr);
787 		}
788 		return -1;
789 	}
790 
791 	/**
792 	 * Locate the "encoding " header line.
793 	 *
794 	 * @param b
795 	 *            buffer to scan.
796 	 * @param ptr
797 	 *            position in buffer to start the scan at. Most callers should
798 	 *            pass 0 to ensure the scan starts from the beginning of the
799 	 *            buffer and does not accidentally look at the message body.
800 	 * @return position just after the space in "encoding ", so the first
801 	 *         character of the encoding's name. If no encoding header can be
802 	 *         located -1 is returned (and UTF-8 should be assumed).
803 	 */
804 	public static final int encoding(byte[] b, int ptr) {
805 		final int sz = b.length;
806 		while (ptr < sz) {
807 			if (b[ptr] == '\n')
808 				return -1;
809 			if (b[ptr] == 'e')
810 				break;
811 			ptr = nextLF(b, ptr);
812 		}
813 		return match(b, ptr, encoding);
814 	}
815 
816 	/**
817 	 * Parse the "encoding " header as a string.
818 	 * <p>
819 	 * Locates the "encoding " header (if present) and returns its value.
820 	 *
821 	 * @param b
822 	 *            buffer to scan.
823 	 * @return the encoding header as specified in the commit; null if the
824 	 *         header was not present and should be assumed.
825 	 * @since 4.2
826 	 */
827 	@Nullable
828 	public static String parseEncodingName(byte[] b) {
829 		int enc = encoding(b, 0);
830 		if (enc < 0) {
831 			return null;
832 		}
833 		int lf = nextLF(b, enc);
834 		return decode(UTF_8, b, enc, lf - 1);
835 	}
836 
837 	/**
838 	 * Parse the "encoding " header into a character set reference.
839 	 * <p>
840 	 * Locates the "encoding " header (if present) by first calling
841 	 * {@link #encoding(byte[], int)} and then returns the proper character set
842 	 * to apply to this buffer to evaluate its contents as character data.
843 	 * <p>
844 	 * If no encoding header is present {@code UTF-8} is assumed.
845 	 *
846 	 * @param b
847 	 *            buffer to scan.
848 	 * @return the Java character set representation. Never null.
849 	 * @throws IllegalCharsetNameException
850 	 *             if the character set requested by the encoding header is
851 	 *             malformed and unsupportable.
852 	 * @throws UnsupportedCharsetException
853 	 *             if the JRE does not support the character set requested by
854 	 *             the encoding header.
855 	 */
856 	public static Charset parseEncoding(byte[] b) {
857 		String enc = parseEncodingName(b);
858 		if (enc == null) {
859 			return UTF_8;
860 		}
861 
862 		String name = enc.trim();
863 		try {
864 			return Charset.forName(name);
865 		} catch (IllegalCharsetNameException
866 				| UnsupportedCharsetException badName) {
867 			Charset aliased = charsetForAlias(name);
868 			if (aliased != null) {
869 				return aliased;
870 			}
871 			throw badName;
872 		}
873 	}
874 
875 	/**
876 	 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
877 	 * <p>
878 	 * Leading spaces won't be trimmed from the string, i.e. will show up in the
879 	 * parsed name afterwards.
880 	 *
881 	 * @param in
882 	 *            the string to parse a name from.
883 	 * @return the parsed identity or null in case the identity could not be
884 	 *         parsed.
885 	 */
886 	public static PersonIdent parsePersonIdent(String in) {
887 		return parsePersonIdent(Constants.encode(in), 0);
888 	}
889 
890 	/**
891 	 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
892 	 * <p>
893 	 * When passing in a value for <code>nameB</code> callers should use the
894 	 * return value of {@link #author(byte[], int)} or
895 	 * {@link #committer(byte[], int)}, as these methods provide the proper
896 	 * position within the buffer.
897 	 *
898 	 * @param raw
899 	 *            the buffer to parse character data from.
900 	 * @param nameB
901 	 *            first position of the identity information. This should be the
902 	 *            first position after the space which delimits the header field
903 	 *            name (e.g. "author" or "committer") from the rest of the
904 	 *            identity line.
905 	 * @return the parsed identity or null in case the identity could not be
906 	 *         parsed.
907 	 */
908 	public static PersonIdent parsePersonIdent(byte[] raw, int nameB) {
909 		Charset cs;
910 		try {
911 			cs = parseEncoding(raw);
912 		} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
913 			// Assume UTF-8 for person identities, usually this is correct.
914 			// If not decode() will fall back to the ISO-8859-1 encoding.
915 			cs = UTF_8;
916 		}
917 
918 		final int emailB = nextLF(raw, nameB, '<');
919 		final int emailE = nextLF(raw, emailB, '>');
920 		if (emailB >= raw.length || raw[emailB] == '\n' ||
921 				(emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
922 			return null;
923 
924 		final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
925 				emailB - 2 : emailB - 1;
926 		final String name = decode(cs, raw, nameB, nameEnd);
927 		final String email = decode(cs, raw, emailB, emailE - 1);
928 
929 		// Start searching from end of line, as after first name-email pair,
930 		// another name-email pair may occur. We will ignore all kinds of
931 		// "junk" following the first email.
932 		//
933 		// We've to use (emailE - 1) for the case that raw[email] is LF,
934 		// otherwise we would run too far. "-2" is necessary to position
935 		// before the LF in case of LF termination resp. the penultimate
936 		// character if there is no trailing LF.
937 		final int tzBegin = lastIndexOfTrim(raw, ' ',
938 				nextLF(raw, emailE - 1) - 2) + 1;
939 		if (tzBegin <= emailE) // No time/zone, still valid
940 			return new PersonIdent(name, email, 0, 0);
941 
942 		final int whenBegin = Math.max(emailE,
943 				lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
944 		if (whenBegin >= tzBegin - 1) // No time/zone, still valid
945 			return new PersonIdent(name, email, 0, 0);
946 
947 		final long when = parseLongBase10(raw, whenBegin, null);
948 		final int tz = parseTimeZoneOffset(raw, tzBegin);
949 		return new PersonIdent(name, email, when * 1000L, tz);
950 	}
951 
952 	/**
953 	 * Parse a name data (e.g. as within a reflog) into a PersonIdent.
954 	 * <p>
955 	 * When passing in a value for <code>nameB</code> callers should use the
956 	 * return value of {@link #author(byte[], int)} or
957 	 * {@link #committer(byte[], int)}, as these methods provide the proper
958 	 * position within the buffer.
959 	 *
960 	 * @param raw
961 	 *            the buffer to parse character data from.
962 	 * @param nameB
963 	 *            first position of the identity information. This should be the
964 	 *            first position after the space which delimits the header field
965 	 *            name (e.g. "author" or "committer") from the rest of the
966 	 *            identity line.
967 	 * @return the parsed identity. Never null.
968 	 */
969 	public static PersonIdent parsePersonIdentOnly(final byte[] raw,
970 			final int nameB) {
971 		int stop = nextLF(raw, nameB);
972 		int emailB = nextLF(raw, nameB, '<');
973 		int emailE = nextLF(raw, emailB, '>');
974 		final String name;
975 		final String email;
976 		if (emailE < stop) {
977 			email = decode(raw, emailB, emailE - 1);
978 		} else {
979 			email = "invalid"; //$NON-NLS-1$
980 		}
981 		if (emailB < stop)
982 			name = decode(raw, nameB, emailB - 2);
983 		else
984 			name = decode(raw, nameB, stop);
985 
986 		final MutableInteger.html#MutableInteger">MutableInteger ptrout = new MutableInteger();
987 		long when;
988 		int tz;
989 		if (emailE < stop) {
990 			when = parseLongBase10(raw, emailE + 1, ptrout);
991 			tz = parseTimeZoneOffset(raw, ptrout.value);
992 		} else {
993 			when = 0;
994 			tz = 0;
995 		}
996 		return new PersonIdent(name, email, when * 1000L, tz);
997 	}
998 
999 	/**
1000 	 * Locate the end of a footer line key string.
1001 	 * <p>
1002 	 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
1003 	 * "Signed-off-by: A. U. Thor\n") then this method returns the position of
1004 	 * the first ':'.
1005 	 * <p>
1006 	 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
1007 	 * then this method returns -1.
1008 	 *
1009 	 * @param raw
1010 	 *            buffer to scan.
1011 	 * @param ptr
1012 	 *            first position within raw to consider as a footer line key.
1013 	 * @return position of the ':' which terminates the footer line key if this
1014 	 *         is otherwise a valid footer line key; otherwise -1.
1015 	 */
1016 	public static int endOfFooterLineKey(byte[] raw, int ptr) {
1017 		try {
1018 			for (;;) {
1019 				final byte c = raw[ptr];
1020 				if (footerLineKeyChars[c] == 0) {
1021 					if (c == ':')
1022 						return ptr;
1023 					return -1;
1024 				}
1025 				ptr++;
1026 			}
1027 		} catch (ArrayIndexOutOfBoundsException e) {
1028 			return -1;
1029 		}
1030 	}
1031 
1032 	/**
1033 	 * Decode a buffer under UTF-8, if possible.
1034 	 *
1035 	 * If the byte stream cannot be decoded that way, the platform default is tried
1036 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1037 	 *
1038 	 * @param buffer
1039 	 *            buffer to pull raw bytes from.
1040 	 * @return a string representation of the range <code>[start,end)</code>,
1041 	 *         after decoding the region through the specified character set.
1042 	 */
1043 	public static String decode(byte[] buffer) {
1044 		return decode(buffer, 0, buffer.length);
1045 	}
1046 
1047 	/**
1048 	 * Decode a buffer under UTF-8, if possible.
1049 	 *
1050 	 * If the byte stream cannot be decoded that way, the platform default is
1051 	 * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1052 	 *
1053 	 * @param buffer
1054 	 *            buffer to pull raw bytes from.
1055 	 * @param start
1056 	 *            start position in buffer
1057 	 * @param end
1058 	 *            one position past the last location within the buffer to take
1059 	 *            data from.
1060 	 * @return a string representation of the range <code>[start,end)</code>,
1061 	 *         after decoding the region through the specified character set.
1062 	 */
1063 	public static String decode(final byte[] buffer, final int start,
1064 			final int end) {
1065 		return decode(UTF_8, buffer, start, end);
1066 	}
1067 
1068 	/**
1069 	 * Decode a buffer under the specified character set if possible.
1070 	 *
1071 	 * If the byte stream cannot be decoded that way, the platform default is tried
1072 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1073 	 *
1074 	 * @param cs
1075 	 *            character set to use when decoding the buffer.
1076 	 * @param buffer
1077 	 *            buffer to pull raw bytes from.
1078 	 * @return a string representation of the range <code>[start,end)</code>,
1079 	 *         after decoding the region through the specified character set.
1080 	 */
1081 	public static String decode(Charset cs, byte[] buffer) {
1082 		return decode(cs, buffer, 0, buffer.length);
1083 	}
1084 
1085 	/**
1086 	 * Decode a region of the buffer under the specified character set if possible.
1087 	 *
1088 	 * If the byte stream cannot be decoded that way, the platform default is tried
1089 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1090 	 *
1091 	 * @param cs
1092 	 *            character set to use when decoding the buffer.
1093 	 * @param buffer
1094 	 *            buffer to pull raw bytes from.
1095 	 * @param start
1096 	 *            first position within the buffer to take data from.
1097 	 * @param end
1098 	 *            one position past the last location within the buffer to take
1099 	 *            data from.
1100 	 * @return a string representation of the range <code>[start,end)</code>,
1101 	 *         after decoding the region through the specified character set.
1102 	 */
1103 	public static String decode(final Charset cs, final byte[] buffer,
1104 			final int start, final int end) {
1105 		try {
1106 			return decodeNoFallback(cs, buffer, start, end);
1107 		} catch (CharacterCodingException e) {
1108 			// Fall back to an ISO-8859-1 style encoding. At least all of
1109 			// the bytes will be present in the output.
1110 			//
1111 			return extractBinaryString(buffer, start, end);
1112 		}
1113 	}
1114 
1115 	/**
1116 	 * Decode a region of the buffer under the specified character set if
1117 	 * possible.
1118 	 *
1119 	 * If the byte stream cannot be decoded that way, the platform default is
1120 	 * tried and if that too fails, an exception is thrown.
1121 	 *
1122 	 * @param cs
1123 	 *            character set to use when decoding the buffer.
1124 	 * @param buffer
1125 	 *            buffer to pull raw bytes from.
1126 	 * @param start
1127 	 *            first position within the buffer to take data from.
1128 	 * @param end
1129 	 *            one position past the last location within the buffer to take
1130 	 *            data from.
1131 	 * @return a string representation of the range <code>[start,end)</code>,
1132 	 *         after decoding the region through the specified character set.
1133 	 * @throws java.nio.charset.CharacterCodingException
1134 	 *             the input is not in any of the tested character sets.
1135 	 */
1136 	public static String decodeNoFallback(final Charset cs,
1137 			final byte[] buffer, final int start, final int end)
1138 			throws CharacterCodingException {
1139 		ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
1140 		b.mark();
1141 
1142 		// Try our built-in favorite. The assumption here is that
1143 		// decoding will fail if the data is not actually encoded
1144 		// using that encoder.
1145 		try {
1146 			return decode(b, UTF_8);
1147 		} catch (CharacterCodingException e) {
1148 			b.reset();
1149 		}
1150 
1151 		if (!cs.equals(UTF_8)) {
1152 			// Try the suggested encoding, it might be right since it was
1153 			// provided by the caller.
1154 			try {
1155 				return decode(b, cs);
1156 			} catch (CharacterCodingException e) {
1157 				b.reset();
1158 			}
1159 		}
1160 
1161 		// Try the default character set. A small group of people
1162 		// might actually use the same (or very similar) locale.
1163 		Charset defcs = Charset.defaultCharset();
1164 		if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
1165 			try {
1166 				return decode(b, defcs);
1167 			} catch (CharacterCodingException e) {
1168 				b.reset();
1169 			}
1170 		}
1171 
1172 		throw new CharacterCodingException();
1173 	}
1174 
1175 	/**
1176 	 * Decode a region of the buffer under the ISO-8859-1 encoding.
1177 	 *
1178 	 * Each byte is treated as a single character in the 8859-1 character
1179 	 * encoding, performing a raw binary-&gt;char conversion.
1180 	 *
1181 	 * @param buffer
1182 	 *            buffer to pull raw bytes from.
1183 	 * @param start
1184 	 *            first position within the buffer to take data from.
1185 	 * @param end
1186 	 *            one position past the last location within the buffer to take
1187 	 *            data from.
1188 	 * @return a string representation of the range <code>[start,end)</code>.
1189 	 */
1190 	public static String extractBinaryString(final byte[] buffer,
1191 			final int start, final int end) {
1192 		final StringBuilder r = new StringBuilder(end - start);
1193 		for (int i = start; i < end; i++)
1194 			r.append((char) (buffer[i] & 0xff));
1195 		return r.toString();
1196 	}
1197 
1198 	private static String decode(ByteBuffer b, Charset charset)
1199 			throws CharacterCodingException {
1200 		final CharsetDecoder d = charset.newDecoder();
1201 		d.onMalformedInput(CodingErrorAction.REPORT);
1202 		d.onUnmappableCharacter(CodingErrorAction.REPORT);
1203 		return d.decode(b).toString();
1204 	}
1205 
1206 	/**
1207 	 * Locate the position of the commit message body.
1208 	 *
1209 	 * @param b
1210 	 *            buffer to scan.
1211 	 * @param ptr
1212 	 *            position in buffer to start the scan at. Most callers should
1213 	 *            pass 0 to ensure the scan starts from the beginning of the
1214 	 *            commit buffer.
1215 	 * @return position of the user's message buffer.
1216 	 */
1217 	public static final int commitMessage(byte[] b, int ptr) {
1218 		final int sz = b.length;
1219 		if (ptr == 0)
1220 			ptr += 46; // skip the "tree ..." line.
1221 		while (ptr < sz && b[ptr] == 'p')
1222 			ptr += 48; // skip this parent.
1223 
1224 		// Skip any remaining header lines, ignoring what their actual
1225 		// header line type is. This is identical to the logic for a tag.
1226 		//
1227 		return tagMessage(b, ptr);
1228 	}
1229 
1230 	/**
1231 	 * Locate the position of the tag message body.
1232 	 *
1233 	 * @param b
1234 	 *            buffer to scan.
1235 	 * @param ptr
1236 	 *            position in buffer to start the scan at. Most callers should
1237 	 *            pass 0 to ensure the scan starts from the beginning of the tag
1238 	 *            buffer.
1239 	 * @return position of the user's message buffer.
1240 	 */
1241 	public static final int tagMessage(byte[] b, int ptr) {
1242 		final int sz = b.length;
1243 		if (ptr == 0)
1244 			ptr += 48; // skip the "object ..." line.
1245 		while (ptr < sz && b[ptr] != '\n')
1246 			ptr = nextLF(b, ptr);
1247 		if (ptr < sz && b[ptr] == '\n')
1248 			return ptr + 1;
1249 		return -1;
1250 	}
1251 
1252 	/**
1253 	 * Locate the end of a paragraph.
1254 	 * <p>
1255 	 * A paragraph is ended by two consecutive LF bytes or CRLF pairs
1256 	 *
1257 	 * @param b
1258 	 *            buffer to scan.
1259 	 * @param start
1260 	 *            position in buffer to start the scan at. Most callers will
1261 	 *            want to pass the first position of the commit message (as
1262 	 *            found by {@link #commitMessage(byte[], int)}.
1263 	 * @return position of the LF at the end of the paragraph;
1264 	 *         <code>b.length</code> if no paragraph end could be located.
1265 	 */
1266 	public static final int endOfParagraph(byte[] b, int start) {
1267 		int ptr = start;
1268 		final int sz = b.length;
1269 		while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
1270 			ptr = nextLF(b, ptr);
1271 		if (ptr > start && b[ptr - 1] == '\n')
1272 			ptr--;
1273 		if (ptr > start && b[ptr - 1] == '\r')
1274 			ptr--;
1275 		return ptr;
1276 	}
1277 
1278 	/**
1279 	 * Get last index of {@code ch} in raw, trimming spaces.
1280 	 *
1281 	 * @param raw
1282 	 *            buffer to scan.
1283 	 * @param ch
1284 	 *            character to find.
1285 	 * @param pos
1286 	 *            starting position.
1287 	 * @return last index of {@code ch} in raw, trimming spaces.
1288 	 * @since 4.1
1289 	 */
1290 	public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
1291 		while (pos >= 0 && raw[pos] == ' ')
1292 			pos--;
1293 
1294 		while (pos >= 0 && raw[pos] != ch)
1295 			pos--;
1296 
1297 		return pos;
1298 	}
1299 
1300 	private static Charset charsetForAlias(String name) {
1301 		return encodingAliases.get(StringUtils.toLowerCase(name));
1302 	}
1303 
1304 	private RawParseUtils() {
1305 		// Don't create instances of a static only utility.
1306 	}
1307 }