View Javadoc
1   /*
2    * Copyright (C) 2008-2009, Google Inc.
3    * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org>
4    * and other copyright owners as documented in the project's IP log.
5    *
6    * This program and the accompanying materials are made available
7    * under the terms of the Eclipse Distribution License v1.0 which
8    * accompanies this distribution, is reproduced below, and is
9    * available at http://www.eclipse.org/org/documents/edl-v10.php
10   *
11   * All rights reserved.
12   *
13   * Redistribution and use in source and binary forms, with or
14   * without modification, are permitted provided that the following
15   * conditions are met:
16   *
17   * - Redistributions of source code must retain the above copyright
18   *   notice, this list of conditions and the following disclaimer.
19   *
20   * - Redistributions in binary form must reproduce the above
21   *   copyright notice, this list of conditions and the following
22   *   disclaimer in the documentation and/or other materials provided
23   *   with the distribution.
24   *
25   * - Neither the name of the Eclipse Foundation, Inc. nor the
26   *   names of its contributors may be used to endorse or promote
27   *   products derived from this software without specific prior
28   *   written permission.
29   *
30   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
31   * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
32   * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
33   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
35   * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
36   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
37   * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
38   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
39   * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
40   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
41   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
42   * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43   */
44  
45  package org.eclipse.jgit.util;
46  
47  import static java.nio.charset.StandardCharsets.ISO_8859_1;
48  import static java.nio.charset.StandardCharsets.UTF_8;
49  import static org.eclipse.jgit.lib.ObjectChecker.author;
50  import static org.eclipse.jgit.lib.ObjectChecker.committer;
51  import static org.eclipse.jgit.lib.ObjectChecker.encoding;
52  import static org.eclipse.jgit.lib.ObjectChecker.tagger;
53  
54  import java.nio.ByteBuffer;
55  import java.nio.charset.CharacterCodingException;
56  import java.nio.charset.Charset;
57  import java.nio.charset.CharsetDecoder;
58  import java.nio.charset.CodingErrorAction;
59  import java.nio.charset.IllegalCharsetNameException;
60  import java.nio.charset.UnsupportedCharsetException;
61  import java.util.Arrays;
62  import java.util.HashMap;
63  import java.util.Map;
64  
65  import org.eclipse.jgit.annotations.Nullable;
66  import org.eclipse.jgit.lib.Constants;
67  import org.eclipse.jgit.lib.PersonIdent;
68  
69  /**
70   * Handy utility functions to parse raw object contents.
71   */
72  public final class RawParseUtils {
73  	/**
74  	 * UTF-8 charset constant.
75  	 *
76  	 * @since 2.2
77  	 */
78  	public static final Charset UTF8_CHARSET = UTF_8;
79  
80  	private static final byte[] digits10;
81  
82  	private static final byte[] digits16;
83  
84  	private static final byte[] footerLineKeyChars;
85  
86  	private static final Map<String, Charset> encodingAliases;
87  
88  	static {
89  		encodingAliases = new HashMap<>();
90  		encodingAliases.put("latin-1", ISO_8859_1); //$NON-NLS-1$
91  		encodingAliases.put("iso-latin-1", ISO_8859_1); //$NON-NLS-1$
92  
93  		digits10 = new byte['9' + 1];
94  		Arrays.fill(digits10, (byte) -1);
95  		for (char i = '0'; i <= '9'; i++)
96  			digits10[i] = (byte) (i - '0');
97  
98  		digits16 = new byte['f' + 1];
99  		Arrays.fill(digits16, (byte) -1);
100 		for (char i = '0'; i <= '9'; i++)
101 			digits16[i] = (byte) (i - '0');
102 		for (char i = 'a'; i <= 'f'; i++)
103 			digits16[i] = (byte) ((i - 'a') + 10);
104 		for (char i = 'A'; i <= 'F'; i++)
105 			digits16[i] = (byte) ((i - 'A') + 10);
106 
107 		footerLineKeyChars = new byte['z' + 1];
108 		footerLineKeyChars['-'] = 1;
109 		for (char i = '0'; i <= '9'; i++)
110 			footerLineKeyChars[i] = 1;
111 		for (char i = 'A'; i <= 'Z'; i++)
112 			footerLineKeyChars[i] = 1;
113 		for (char i = 'a'; i <= 'z'; i++)
114 			footerLineKeyChars[i] = 1;
115 	}
116 
117 	/**
118 	 * Determine if b[ptr] matches src.
119 	 *
120 	 * @param b
121 	 *            the buffer to scan.
122 	 * @param ptr
123 	 *            first position within b, this should match src[0].
124 	 * @param src
125 	 *            the buffer to test for equality with b.
126 	 * @return ptr + src.length if b[ptr..src.length] == src; else -1.
127 	 */
128 	public static final int match(final byte[] b, int ptr, final byte[] src) {
129 		if (ptr + src.length > b.length)
130 			return -1;
131 		for (int i = 0; i < src.length; i++, ptr++)
132 			if (b[ptr] != src[i])
133 				return -1;
134 		return ptr;
135 	}
136 
137 	private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
138 			'6', '7', '8', '9' };
139 
140 	/**
141 	 * Format a base 10 numeric into a temporary buffer.
142 	 * <p>
143 	 * Formatting is performed backwards. The method starts at offset
144 	 * <code>o-1</code> and ends at <code>o-1-digits</code>, where
145 	 * <code>digits</code> is the number of positions necessary to store the
146 	 * base 10 value.
147 	 * <p>
148 	 * The argument and return values from this method make it easy to chain
149 	 * writing, for example:
150 	 * </p>
151 	 *
152 	 * <pre>
153 	 * final byte[] tmp = new byte[64];
154 	 * int ptr = tmp.length;
155 	 * tmp[--ptr] = '\n';
156 	 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
157 	 * tmp[--ptr] = ' ';
158 	 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
159 	 * tmp[--ptr] = 0;
160 	 * final String str = new String(tmp, ptr, tmp.length - ptr);
161 	 * </pre>
162 	 *
163 	 * @param b
164 	 *            buffer to write into.
165 	 * @param o
166 	 *            one offset past the location where writing will begin; writing
167 	 *            proceeds towards lower index values.
168 	 * @param value
169 	 *            the value to store.
170 	 * @return the new offset value <code>o</code>. This is the position of
171 	 *         the last byte written. Additional writing should start at one
172 	 *         position earlier.
173 	 */
174 	public static int formatBase10(final byte[] b, int o, int value) {
175 		if (value == 0) {
176 			b[--o] = '0';
177 			return o;
178 		}
179 		final boolean isneg = value < 0;
180 		if (isneg)
181 			value = -value;
182 		while (value != 0) {
183 			b[--o] = base10byte[value % 10];
184 			value /= 10;
185 		}
186 		if (isneg)
187 			b[--o] = '-';
188 		return o;
189 	}
190 
191 	/**
192 	 * Parse a base 10 numeric from a sequence of ASCII digits into an int.
193 	 * <p>
194 	 * Digit sequences can begin with an optional run of spaces before the
195 	 * sequence, and may start with a '+' or a '-' to indicate sign position.
196 	 * Any other characters will cause the method to stop and return the current
197 	 * result to the caller.
198 	 *
199 	 * @param b
200 	 *            buffer to scan.
201 	 * @param ptr
202 	 *            position within buffer to start parsing digits at.
203 	 * @param ptrResult
204 	 *            optional location to return the new ptr value through. If null
205 	 *            the ptr value will be discarded.
206 	 * @return the value at this location; 0 if the location is not a valid
207 	 *         numeric.
208 	 */
209 	public static final int parseBase10(final byte[] b, int ptr,
210 			final MutableInteger ptrResult) {
211 		int r = 0;
212 		int sign = 0;
213 		try {
214 			final int sz = b.length;
215 			while (ptr < sz && b[ptr] == ' ')
216 				ptr++;
217 			if (ptr >= sz)
218 				return 0;
219 
220 			switch (b[ptr]) {
221 			case '-':
222 				sign = -1;
223 				ptr++;
224 				break;
225 			case '+':
226 				ptr++;
227 				break;
228 			}
229 
230 			while (ptr < sz) {
231 				final byte v = digits10[b[ptr]];
232 				if (v < 0)
233 					break;
234 				r = (r * 10) + v;
235 				ptr++;
236 			}
237 		} catch (ArrayIndexOutOfBoundsException e) {
238 			// Not a valid digit.
239 		}
240 		if (ptrResult != null)
241 			ptrResult.value = ptr;
242 		return sign < 0 ? -r : r;
243 	}
244 
245 	/**
246 	 * Parse a base 10 numeric from a sequence of ASCII digits into a long.
247 	 * <p>
248 	 * Digit sequences can begin with an optional run of spaces before the
249 	 * sequence, and may start with a '+' or a '-' to indicate sign position.
250 	 * Any other characters will cause the method to stop and return the current
251 	 * result to the caller.
252 	 *
253 	 * @param b
254 	 *            buffer to scan.
255 	 * @param ptr
256 	 *            position within buffer to start parsing digits at.
257 	 * @param ptrResult
258 	 *            optional location to return the new ptr value through. If null
259 	 *            the ptr value will be discarded.
260 	 * @return the value at this location; 0 if the location is not a valid
261 	 *         numeric.
262 	 */
263 	public static final long parseLongBase10(final byte[] b, int ptr,
264 			final MutableInteger ptrResult) {
265 		long r = 0;
266 		int sign = 0;
267 		try {
268 			final int sz = b.length;
269 			while (ptr < sz && b[ptr] == ' ')
270 				ptr++;
271 			if (ptr >= sz)
272 				return 0;
273 
274 			switch (b[ptr]) {
275 			case '-':
276 				sign = -1;
277 				ptr++;
278 				break;
279 			case '+':
280 				ptr++;
281 				break;
282 			}
283 
284 			while (ptr < sz) {
285 				final byte v = digits10[b[ptr]];
286 				if (v < 0)
287 					break;
288 				r = (r * 10) + v;
289 				ptr++;
290 			}
291 		} catch (ArrayIndexOutOfBoundsException e) {
292 			// Not a valid digit.
293 		}
294 		if (ptrResult != null)
295 			ptrResult.value = ptr;
296 		return sign < 0 ? -r : r;
297 	}
298 
299 	/**
300 	 * Parse 4 character base 16 (hex) formatted string to unsigned integer.
301 	 * <p>
302 	 * The number is read in network byte order, that is, most significant
303 	 * nybble first.
304 	 *
305 	 * @param bs
306 	 *            buffer to parse digits from; positions {@code [p, p+4)} will
307 	 *            be parsed.
308 	 * @param p
309 	 *            first position within the buffer to parse.
310 	 * @return the integer value.
311 	 * @throws java.lang.ArrayIndexOutOfBoundsException
312 	 *             if the string is not hex formatted.
313 	 */
314 	public static final int parseHexInt16(final byte[] bs, final int p) {
315 		int r = digits16[bs[p]] << 4;
316 
317 		r |= digits16[bs[p + 1]];
318 		r <<= 4;
319 
320 		r |= digits16[bs[p + 2]];
321 		r <<= 4;
322 
323 		r |= digits16[bs[p + 3]];
324 		if (r < 0)
325 			throw new ArrayIndexOutOfBoundsException();
326 		return r;
327 	}
328 
329 	/**
330 	 * Parse 8 character base 16 (hex) formatted string to unsigned integer.
331 	 * <p>
332 	 * The number is read in network byte order, that is, most significant
333 	 * nybble first.
334 	 *
335 	 * @param bs
336 	 *            buffer to parse digits from; positions {@code [p, p+8)} will
337 	 *            be parsed.
338 	 * @param p
339 	 *            first position within the buffer to parse.
340 	 * @return the integer value.
341 	 * @throws java.lang.ArrayIndexOutOfBoundsException
342 	 *             if the string is not hex formatted.
343 	 */
344 	public static final int parseHexInt32(final byte[] bs, final int p) {
345 		int r = digits16[bs[p]] << 4;
346 
347 		r |= digits16[bs[p + 1]];
348 		r <<= 4;
349 
350 		r |= digits16[bs[p + 2]];
351 		r <<= 4;
352 
353 		r |= digits16[bs[p + 3]];
354 		r <<= 4;
355 
356 		r |= digits16[bs[p + 4]];
357 		r <<= 4;
358 
359 		r |= digits16[bs[p + 5]];
360 		r <<= 4;
361 
362 		r |= digits16[bs[p + 6]];
363 
364 		final int last = digits16[bs[p + 7]];
365 		if (r < 0 || last < 0)
366 			throw new ArrayIndexOutOfBoundsException();
367 		return (r << 4) | last;
368 	}
369 
370 	/**
371 	 * Parse 16 character base 16 (hex) formatted string to unsigned long.
372 	 * <p>
373 	 * The number is read in network byte order, that is, most significant
374 	 * nibble first.
375 	 *
376 	 * @param bs
377 	 *            buffer to parse digits from; positions {@code [p, p+16)} will
378 	 *            be parsed.
379 	 * @param p
380 	 *            first position within the buffer to parse.
381 	 * @return the integer value.
382 	 * @throws java.lang.ArrayIndexOutOfBoundsException
383 	 *             if the string is not hex formatted.
384 	 * @since 4.3
385 	 */
386 	public static final long parseHexInt64(final byte[] bs, final int p) {
387 		long r = digits16[bs[p]] << 4;
388 
389 		r |= digits16[bs[p + 1]];
390 		r <<= 4;
391 
392 		r |= digits16[bs[p + 2]];
393 		r <<= 4;
394 
395 		r |= digits16[bs[p + 3]];
396 		r <<= 4;
397 
398 		r |= digits16[bs[p + 4]];
399 		r <<= 4;
400 
401 		r |= digits16[bs[p + 5]];
402 		r <<= 4;
403 
404 		r |= digits16[bs[p + 6]];
405 		r <<= 4;
406 
407 		r |= digits16[bs[p + 7]];
408 		r <<= 4;
409 
410 		r |= digits16[bs[p + 8]];
411 		r <<= 4;
412 
413 		r |= digits16[bs[p + 9]];
414 		r <<= 4;
415 
416 		r |= digits16[bs[p + 10]];
417 		r <<= 4;
418 
419 		r |= digits16[bs[p + 11]];
420 		r <<= 4;
421 
422 		r |= digits16[bs[p + 12]];
423 		r <<= 4;
424 
425 		r |= digits16[bs[p + 13]];
426 		r <<= 4;
427 
428 		r |= digits16[bs[p + 14]];
429 
430 		final int last = digits16[bs[p + 15]];
431 		if (r < 0 || last < 0)
432 			throw new ArrayIndexOutOfBoundsException();
433 		return (r << 4) | last;
434 	}
435 
436 	/**
437 	 * Parse a single hex digit to its numeric value (0-15).
438 	 *
439 	 * @param digit
440 	 *            hex character to parse.
441 	 * @return numeric value, in the range 0-15.
442 	 * @throws java.lang.ArrayIndexOutOfBoundsException
443 	 *             if the input digit is not a valid hex digit.
444 	 */
445 	public static final int parseHexInt4(final byte digit) {
446 		final byte r = digits16[digit];
447 		if (r < 0)
448 			throw new ArrayIndexOutOfBoundsException();
449 		return r;
450 	}
451 
452 	/**
453 	 * Parse a Git style timezone string.
454 	 * <p>
455 	 * The sequence "-0315" will be parsed as the numeric value -195, as the
456 	 * lower two positions count minutes, not 100ths of an hour.
457 	 *
458 	 * @param b
459 	 *            buffer to scan.
460 	 * @param ptr
461 	 *            position within buffer to start parsing digits at.
462 	 * @return the timezone at this location, expressed in minutes.
463 	 */
464 	public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
465 		return parseTimeZoneOffset(b, ptr, null);
466 	}
467 
468 	/**
469 	 * Parse a Git style timezone string.
470 	 * <p>
471 	 * The sequence "-0315" will be parsed as the numeric value -195, as the
472 	 * lower two positions count minutes, not 100ths of an hour.
473 	 *
474 	 * @param b
475 	 *            buffer to scan.
476 	 * @param ptr
477 	 *            position within buffer to start parsing digits at.
478 	 * @param ptrResult
479 	 *            optional location to return the new ptr value through. If null
480 	 *            the ptr value will be discarded.
481 	 * @return the timezone at this location, expressed in minutes.
482 	 * @since 4.1
483 	 */
484 	public static final int parseTimeZoneOffset(final byte[] b, int ptr,
485 			MutableInteger ptrResult) {
486 		final int v = parseBase10(b, ptr, ptrResult);
487 		final int tzMins = v % 100;
488 		final int tzHours = v / 100;
489 		return tzHours * 60 + tzMins;
490 	}
491 
492 	/**
493 	 * Locate the first position after a given character.
494 	 *
495 	 * @param b
496 	 *            buffer to scan.
497 	 * @param ptr
498 	 *            position within buffer to start looking for chrA at.
499 	 * @param chrA
500 	 *            character to find.
501 	 * @return new position just after chrA.
502 	 */
503 	public static final int next(final byte[] b, int ptr, final char chrA) {
504 		final int sz = b.length;
505 		while (ptr < sz) {
506 			if (b[ptr++] == chrA)
507 				return ptr;
508 		}
509 		return ptr;
510 	}
511 
512 	/**
513 	 * Locate the first position after the next LF.
514 	 * <p>
515 	 * This method stops on the first '\n' it finds.
516 	 *
517 	 * @param b
518 	 *            buffer to scan.
519 	 * @param ptr
520 	 *            position within buffer to start looking for LF at.
521 	 * @return new position just after the first LF found.
522 	 */
523 	public static final int nextLF(final byte[] b, int ptr) {
524 		return next(b, ptr, '\n');
525 	}
526 
527 	/**
528 	 * Locate the first position after either the given character or LF.
529 	 * <p>
530 	 * This method stops on the first match it finds from either chrA or '\n'.
531 	 *
532 	 * @param b
533 	 *            buffer to scan.
534 	 * @param ptr
535 	 *            position within buffer to start looking for chrA or LF at.
536 	 * @param chrA
537 	 *            character to find.
538 	 * @return new position just after the first chrA or LF to be found.
539 	 */
540 	public static final int nextLF(final byte[] b, int ptr, final char chrA) {
541 		final int sz = b.length;
542 		while (ptr < sz) {
543 			final byte c = b[ptr++];
544 			if (c == chrA || c == '\n')
545 				return ptr;
546 		}
547 		return ptr;
548 	}
549 
550 	/**
551 	 * Locate the first position before a given character.
552 	 *
553 	 * @param b
554 	 *            buffer to scan.
555 	 * @param ptr
556 	 *            position within buffer to start looking for chrA at.
557 	 * @param chrA
558 	 *            character to find.
559 	 * @return new position just before chrA, -1 for not found
560 	 */
561 	public static final int prev(final byte[] b, int ptr, final char chrA) {
562 		if (ptr == b.length)
563 			--ptr;
564 		while (ptr >= 0) {
565 			if (b[ptr--] == chrA)
566 				return ptr;
567 		}
568 		return ptr;
569 	}
570 
571 	/**
572 	 * Locate the first position before the previous LF.
573 	 * <p>
574 	 * This method stops on the first '\n' it finds.
575 	 *
576 	 * @param b
577 	 *            buffer to scan.
578 	 * @param ptr
579 	 *            position within buffer to start looking for LF at.
580 	 * @return new position just before the first LF found, -1 for not found
581 	 */
582 	public static final int prevLF(final byte[] b, int ptr) {
583 		return prev(b, ptr, '\n');
584 	}
585 
586 	/**
587 	 * Locate the previous position before either the given character or LF.
588 	 * <p>
589 	 * This method stops on the first match it finds from either chrA or '\n'.
590 	 *
591 	 * @param b
592 	 *            buffer to scan.
593 	 * @param ptr
594 	 *            position within buffer to start looking for chrA or LF at.
595 	 * @param chrA
596 	 *            character to find.
597 	 * @return new position just before the first chrA or LF to be found, -1 for
598 	 *         not found
599 	 */
600 	public static final int prevLF(final byte[] b, int ptr, final char chrA) {
601 		if (ptr == b.length)
602 			--ptr;
603 		while (ptr >= 0) {
604 			final byte c = b[ptr--];
605 			if (c == chrA || c == '\n')
606 				return ptr;
607 		}
608 		return ptr;
609 	}
610 
611 	/**
612 	 * Index the region between <code>[ptr, end)</code> to find line starts.
613 	 * <p>
614 	 * The returned list is 1 indexed. Index 0 contains
615 	 * {@link java.lang.Integer#MIN_VALUE} to pad the list out.
616 	 * <p>
617 	 * Using a 1 indexed list means that line numbers can be directly accessed
618 	 * from the list, so <code>list.get(1)</code> (aka get line 1) returns
619 	 * <code>ptr</code>.
620 	 * <p>
621 	 * The last element (index <code>map.size()-1</code>) always contains
622 	 * <code>end</code>.
623 	 * <p>
624 	 * If the data contains a '\0' anywhere, the whole region is considered
625 	 * binary and a LineMap corresponding to a single line is returned.
626 	 * </p>
627 	 *
628 	 * @param buf
629 	 *            buffer to scan.
630 	 * @param ptr
631 	 *            position within the buffer corresponding to the first byte of
632 	 *            line 1.
633 	 * @param end
634 	 *            1 past the end of the content within <code>buf</code>.
635 	 * @return a line map indexing the start position of each line.
636 	 */
637 	public static final IntList lineMap(final byte[] buf, int ptr, int end) {
638 		int start = ptr;
639 
640 		// Experimentally derived from multiple source repositories
641 		// the average number of bytes/line is 36. Its a rough guess
642 		// to initially size our map close to the target.
643 		IntList map = new IntList((end - ptr) / 36);
644 		map.add(Integer.MIN_VALUE);
645 		boolean foundLF = true;
646 		for (; ptr < end; ptr++) {
647 			if (foundLF) {
648 				map.add(ptr);
649 			}
650 
651 			if (buf[ptr] == '\0') {
652 				// binary data.
653 				map = new IntList(3);
654 				map.add(Integer.MIN_VALUE);
655 				map.add(start);
656 				break;
657 			}
658 
659 			foundLF = (buf[ptr] == '\n');
660 		}
661 		map.add(end);
662 		return map;
663 	}
664 
665 	/**
666 	 * Locate the "author " header line data.
667 	 *
668 	 * @param b
669 	 *            buffer to scan.
670 	 * @param ptr
671 	 *            position in buffer to start the scan at. Most callers should
672 	 *            pass 0 to ensure the scan starts from the beginning of the
673 	 *            commit buffer and does not accidentally look at message body.
674 	 * @return position just after the space in "author ", so the first
675 	 *         character of the author's name. If no author header can be
676 	 *         located -1 is returned.
677 	 */
678 	public static final int author(final byte[] b, int ptr) {
679 		final int sz = b.length;
680 		if (ptr == 0)
681 			ptr += 46; // skip the "tree ..." line.
682 		while (ptr < sz && b[ptr] == 'p')
683 			ptr += 48; // skip this parent.
684 		return match(b, ptr, author);
685 	}
686 
687 	/**
688 	 * Locate the "committer " header line data.
689 	 *
690 	 * @param b
691 	 *            buffer to scan.
692 	 * @param ptr
693 	 *            position in buffer to start the scan at. Most callers should
694 	 *            pass 0 to ensure the scan starts from the beginning of the
695 	 *            commit buffer and does not accidentally look at message body.
696 	 * @return position just after the space in "committer ", so the first
697 	 *         character of the committer's name. If no committer header can be
698 	 *         located -1 is returned.
699 	 */
700 	public static final int committer(final byte[] b, int ptr) {
701 		final int sz = b.length;
702 		if (ptr == 0)
703 			ptr += 46; // skip the "tree ..." line.
704 		while (ptr < sz && b[ptr] == 'p')
705 			ptr += 48; // skip this parent.
706 		if (ptr < sz && b[ptr] == 'a')
707 			ptr = nextLF(b, ptr);
708 		return match(b, ptr, committer);
709 	}
710 
711 	/**
712 	 * Locate the "tagger " header line data.
713 	 *
714 	 * @param b
715 	 *            buffer to scan.
716 	 * @param ptr
717 	 *            position in buffer to start the scan at. Most callers should
718 	 *            pass 0 to ensure the scan starts from the beginning of the tag
719 	 *            buffer and does not accidentally look at message body.
720 	 * @return position just after the space in "tagger ", so the first
721 	 *         character of the tagger's name. If no tagger header can be
722 	 *         located -1 is returned.
723 	 */
724 	public static final int tagger(final byte[] b, int ptr) {
725 		final int sz = b.length;
726 		if (ptr == 0)
727 			ptr += 48; // skip the "object ..." line.
728 		while (ptr < sz) {
729 			if (b[ptr] == '\n')
730 				return -1;
731 			final int m = match(b, ptr, tagger);
732 			if (m >= 0)
733 				return m;
734 			ptr = nextLF(b, ptr);
735 		}
736 		return -1;
737 	}
738 
739 	/**
740 	 * Locate the "encoding " header line.
741 	 *
742 	 * @param b
743 	 *            buffer to scan.
744 	 * @param ptr
745 	 *            position in buffer to start the scan at. Most callers should
746 	 *            pass 0 to ensure the scan starts from the beginning of the
747 	 *            buffer and does not accidentally look at the message body.
748 	 * @return position just after the space in "encoding ", so the first
749 	 *         character of the encoding's name. If no encoding header can be
750 	 *         located -1 is returned (and UTF-8 should be assumed).
751 	 */
752 	public static final int encoding(final byte[] b, int ptr) {
753 		final int sz = b.length;
754 		while (ptr < sz) {
755 			if (b[ptr] == '\n')
756 				return -1;
757 			if (b[ptr] == 'e')
758 				break;
759 			ptr = nextLF(b, ptr);
760 		}
761 		return match(b, ptr, encoding);
762 	}
763 
764 	/**
765 	 * Parse the "encoding " header as a string.
766 	 * <p>
767 	 * Locates the "encoding " header (if present) and returns its value.
768 	 *
769 	 * @param b
770 	 *            buffer to scan.
771 	 * @return the encoding header as specified in the commit; null if the
772 	 *         header was not present and should be assumed.
773 	 * @since 4.2
774 	 */
775 	@Nullable
776 	public static String parseEncodingName(final byte[] b) {
777 		int enc = encoding(b, 0);
778 		if (enc < 0) {
779 			return null;
780 		}
781 		int lf = nextLF(b, enc);
782 		return decode(UTF_8, b, enc, lf - 1);
783 	}
784 
785 	/**
786 	 * Parse the "encoding " header into a character set reference.
787 	 * <p>
788 	 * Locates the "encoding " header (if present) by first calling
789 	 * {@link #encoding(byte[], int)} and then returns the proper character set
790 	 * to apply to this buffer to evaluate its contents as character data.
791 	 * <p>
792 	 * If no encoding header is present {@code UTF-8} is assumed.
793 	 *
794 	 * @param b
795 	 *            buffer to scan.
796 	 * @return the Java character set representation. Never null.
797 	 * @throws IllegalCharsetNameException
798 	 *             if the character set requested by the encoding header is
799 	 *             malformed and unsupportable.
800 	 * @throws UnsupportedCharsetException
801 	 *             if the JRE does not support the character set requested by
802 	 *             the encoding header.
803 	 */
804 	public static Charset parseEncoding(final byte[] b) {
805 		String enc = parseEncodingName(b);
806 		if (enc == null) {
807 			return UTF_8;
808 		}
809 
810 		String name = enc.trim();
811 		try {
812 			return Charset.forName(name);
813 		} catch (IllegalCharsetNameException
814 				| UnsupportedCharsetException badName) {
815 			Charset aliased = charsetForAlias(name);
816 			if (aliased != null) {
817 				return aliased;
818 			}
819 			throw badName;
820 		}
821 	}
822 
823 	/**
824 	 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
825 	 * <p>
826 	 * Leading spaces won't be trimmed from the string, i.e. will show up in the
827 	 * parsed name afterwards.
828 	 *
829 	 * @param in
830 	 *            the string to parse a name from.
831 	 * @return the parsed identity or null in case the identity could not be
832 	 *         parsed.
833 	 */
834 	public static PersonIdent parsePersonIdent(final String in) {
835 		return parsePersonIdent(Constants.encode(in), 0);
836 	}
837 
838 	/**
839 	 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
840 	 * <p>
841 	 * When passing in a value for <code>nameB</code> callers should use the
842 	 * return value of {@link #author(byte[], int)} or
843 	 * {@link #committer(byte[], int)}, as these methods provide the proper
844 	 * position within the buffer.
845 	 *
846 	 * @param raw
847 	 *            the buffer to parse character data from.
848 	 * @param nameB
849 	 *            first position of the identity information. This should be the
850 	 *            first position after the space which delimits the header field
851 	 *            name (e.g. "author" or "committer") from the rest of the
852 	 *            identity line.
853 	 * @return the parsed identity or null in case the identity could not be
854 	 *         parsed.
855 	 */
856 	public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
857 		Charset cs;
858 		try {
859 			cs = parseEncoding(raw);
860 		} catch (IllegalCharsetNameException | UnsupportedCharsetException e) {
861 			// Assume UTF-8 for person identities, usually this is correct.
862 			// If not decode() will fall back to the ISO-8859-1 encoding.
863 			cs = UTF_8;
864 		}
865 
866 		final int emailB = nextLF(raw, nameB, '<');
867 		final int emailE = nextLF(raw, emailB, '>');
868 		if (emailB >= raw.length || raw[emailB] == '\n' ||
869 				(emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
870 			return null;
871 
872 		final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
873 				emailB - 2 : emailB - 1;
874 		final String name = decode(cs, raw, nameB, nameEnd);
875 		final String email = decode(cs, raw, emailB, emailE - 1);
876 
877 		// Start searching from end of line, as after first name-email pair,
878 		// another name-email pair may occur. We will ignore all kinds of
879 		// "junk" following the first email.
880 		//
881 		// We've to use (emailE - 1) for the case that raw[email] is LF,
882 		// otherwise we would run too far. "-2" is necessary to position
883 		// before the LF in case of LF termination resp. the penultimate
884 		// character if there is no trailing LF.
885 		final int tzBegin = lastIndexOfTrim(raw, ' ',
886 				nextLF(raw, emailE - 1) - 2) + 1;
887 		if (tzBegin <= emailE) // No time/zone, still valid
888 			return new PersonIdent(name, email, 0, 0);
889 
890 		final int whenBegin = Math.max(emailE,
891 				lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
892 		if (whenBegin >= tzBegin - 1) // No time/zone, still valid
893 			return new PersonIdent(name, email, 0, 0);
894 
895 		final long when = parseLongBase10(raw, whenBegin, null);
896 		final int tz = parseTimeZoneOffset(raw, tzBegin);
897 		return new PersonIdent(name, email, when * 1000L, tz);
898 	}
899 
900 	/**
901 	 * Parse a name data (e.g. as within a reflog) into a PersonIdent.
902 	 * <p>
903 	 * When passing in a value for <code>nameB</code> callers should use the
904 	 * return value of {@link #author(byte[], int)} or
905 	 * {@link #committer(byte[], int)}, as these methods provide the proper
906 	 * position within the buffer.
907 	 *
908 	 * @param raw
909 	 *            the buffer to parse character data from.
910 	 * @param nameB
911 	 *            first position of the identity information. This should be the
912 	 *            first position after the space which delimits the header field
913 	 *            name (e.g. "author" or "committer") from the rest of the
914 	 *            identity line.
915 	 * @return the parsed identity. Never null.
916 	 */
917 	public static PersonIdent parsePersonIdentOnly(final byte[] raw,
918 			final int nameB) {
919 		int stop = nextLF(raw, nameB);
920 		int emailB = nextLF(raw, nameB, '<');
921 		int emailE = nextLF(raw, emailB, '>');
922 		final String name;
923 		final String email;
924 		if (emailE < stop) {
925 			email = decode(raw, emailB, emailE - 1);
926 		} else {
927 			email = "invalid"; //$NON-NLS-1$
928 		}
929 		if (emailB < stop)
930 			name = decode(raw, nameB, emailB - 2);
931 		else
932 			name = decode(raw, nameB, stop);
933 
934 		final MutableInteger ptrout = new MutableInteger();
935 		long when;
936 		int tz;
937 		if (emailE < stop) {
938 			when = parseLongBase10(raw, emailE + 1, ptrout);
939 			tz = parseTimeZoneOffset(raw, ptrout.value);
940 		} else {
941 			when = 0;
942 			tz = 0;
943 		}
944 		return new PersonIdent(name, email, when * 1000L, tz);
945 	}
946 
947 	/**
948 	 * Locate the end of a footer line key string.
949 	 * <p>
950 	 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
951 	 * "Signed-off-by: A. U. Thor\n") then this method returns the position of
952 	 * the first ':'.
953 	 * <p>
954 	 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
955 	 * then this method returns -1.
956 	 *
957 	 * @param raw
958 	 *            buffer to scan.
959 	 * @param ptr
960 	 *            first position within raw to consider as a footer line key.
961 	 * @return position of the ':' which terminates the footer line key if this
962 	 *         is otherwise a valid footer line key; otherwise -1.
963 	 */
964 	public static int endOfFooterLineKey(final byte[] raw, int ptr) {
965 		try {
966 			for (;;) {
967 				final byte c = raw[ptr];
968 				if (footerLineKeyChars[c] == 0) {
969 					if (c == ':')
970 						return ptr;
971 					return -1;
972 				}
973 				ptr++;
974 			}
975 		} catch (ArrayIndexOutOfBoundsException e) {
976 			return -1;
977 		}
978 	}
979 
	/**
	 * Decode an entire buffer under UTF-8, if possible.
	 * <p>
	 * This is a convenience for {@link #decode(byte[], int, int)} over the
	 * range <code>[0,buffer.length)</code>. If the byte stream cannot be
	 * decoded as UTF-8, the platform default is tried and if that too fails,
	 * the fail-safe ISO-8859-1 encoding is used.
	 *
	 * @param buffer
	 *            buffer to pull raw bytes from.
	 * @return a string representation of the whole buffer, after decoding it
	 *         as described above.
	 */
	public static String decode(final byte[] buffer) {
		return decode(buffer, 0, buffer.length);
	}
994 
	/**
	 * Decode a region of the buffer under UTF-8, if possible.
	 * <p>
	 * This is a convenience for
	 * {@link #decode(Charset, byte[], int, int)} with UTF-8 as the character
	 * set. If the byte stream cannot be decoded that way, the platform
	 * default is tried and if that too fails, the fail-safe ISO-8859-1
	 * encoding is used.
	 *
	 * @param buffer
	 *            buffer to pull raw bytes from.
	 * @param start
	 *            first position within the buffer to take data from.
	 * @param end
	 *            one position past the last location within the buffer to take
	 *            data from.
	 * @return a string representation of the range <code>[start,end)</code>,
	 *         after decoding the region as described above.
	 */
	public static String decode(final byte[] buffer, final int start,
			final int end) {
		return decode(UTF_8, buffer, start, end);
	}
1015 
	/**
	 * Decode an entire buffer under the specified character set if possible.
	 * <p>
	 * This is a convenience for
	 * {@link #decode(Charset, byte[], int, int)} over the range
	 * <code>[0,buffer.length)</code>. UTF-8 is attempted first, then the
	 * specified character set, then the platform default; if all of those
	 * fail, the fail-safe ISO-8859-1 encoding is used.
	 *
	 * @param cs
	 *            character set to use when decoding the buffer.
	 * @param buffer
	 *            buffer to pull raw bytes from.
	 * @return a string representation of the whole buffer, after decoding it
	 *         as described above.
	 */
	public static String decode(final Charset cs, final byte[] buffer) {
		return decode(cs, buffer, 0, buffer.length);
	}
1032 
1033 	/**
1034 	 * Decode a region of the buffer under the specified character set if possible.
1035 	 *
1036 	 * If the byte stream cannot be decoded that way, the platform default is tried
1037 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
1038 	 *
1039 	 * @param cs
1040 	 *            character set to use when decoding the buffer.
1041 	 * @param buffer
1042 	 *            buffer to pull raw bytes from.
1043 	 * @param start
1044 	 *            first position within the buffer to take data from.
1045 	 * @param end
1046 	 *            one position past the last location within the buffer to take
1047 	 *            data from.
1048 	 * @return a string representation of the range <code>[start,end)</code>,
1049 	 *         after decoding the region through the specified character set.
1050 	 */
1051 	public static String decode(final Charset cs, final byte[] buffer,
1052 			final int start, final int end) {
1053 		try {
1054 			return decodeNoFallback(cs, buffer, start, end);
1055 		} catch (CharacterCodingException e) {
1056 			// Fall back to an ISO-8859-1 style encoding. At least all of
1057 			// the bytes will be present in the output.
1058 			//
1059 			return extractBinaryString(buffer, start, end);
1060 		}
1061 	}
1062 
1063 	/**
1064 	 * Decode a region of the buffer under the specified character set if
1065 	 * possible.
1066 	 *
1067 	 * If the byte stream cannot be decoded that way, the platform default is
1068 	 * tried and if that too fails, an exception is thrown.
1069 	 *
1070 	 * @param cs
1071 	 *            character set to use when decoding the buffer.
1072 	 * @param buffer
1073 	 *            buffer to pull raw bytes from.
1074 	 * @param start
1075 	 *            first position within the buffer to take data from.
1076 	 * @param end
1077 	 *            one position past the last location within the buffer to take
1078 	 *            data from.
1079 	 * @return a string representation of the range <code>[start,end)</code>,
1080 	 *         after decoding the region through the specified character set.
1081 	 * @throws java.nio.charset.CharacterCodingException
1082 	 *             the input is not in any of the tested character sets.
1083 	 */
1084 	public static String decodeNoFallback(final Charset cs,
1085 			final byte[] buffer, final int start, final int end)
1086 			throws CharacterCodingException {
1087 		ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
1088 		b.mark();
1089 
1090 		// Try our built-in favorite. The assumption here is that
1091 		// decoding will fail if the data is not actually encoded
1092 		// using that encoder.
1093 		try {
1094 			return decode(b, UTF_8);
1095 		} catch (CharacterCodingException e) {
1096 			b.reset();
1097 		}
1098 
1099 		if (!cs.equals(UTF_8)) {
1100 			// Try the suggested encoding, it might be right since it was
1101 			// provided by the caller.
1102 			try {
1103 				return decode(b, cs);
1104 			} catch (CharacterCodingException e) {
1105 				b.reset();
1106 			}
1107 		}
1108 
1109 		// Try the default character set. A small group of people
1110 		// might actually use the same (or very similar) locale.
1111 		Charset defcs = Charset.defaultCharset();
1112 		if (!defcs.equals(cs) && !defcs.equals(UTF_8)) {
1113 			try {
1114 				return decode(b, defcs);
1115 			} catch (CharacterCodingException e) {
1116 				b.reset();
1117 			}
1118 		}
1119 
1120 		throw new CharacterCodingException();
1121 	}
1122 
1123 	/**
1124 	 * Decode a region of the buffer under the ISO-8859-1 encoding.
1125 	 *
1126 	 * Each byte is treated as a single character in the 8859-1 character
1127 	 * encoding, performing a raw binary-&gt;char conversion.
1128 	 *
1129 	 * @param buffer
1130 	 *            buffer to pull raw bytes from.
1131 	 * @param start
1132 	 *            first position within the buffer to take data from.
1133 	 * @param end
1134 	 *            one position past the last location within the buffer to take
1135 	 *            data from.
1136 	 * @return a string representation of the range <code>[start,end)</code>.
1137 	 */
1138 	public static String extractBinaryString(final byte[] buffer,
1139 			final int start, final int end) {
1140 		final StringBuilder r = new StringBuilder(end - start);
1141 		for (int i = start; i < end; i++)
1142 			r.append((char) (buffer[i] & 0xff));
1143 		return r.toString();
1144 	}
1145 
1146 	private static String decode(final ByteBuffer b, final Charset charset)
1147 			throws CharacterCodingException {
1148 		final CharsetDecoder d = charset.newDecoder();
1149 		d.onMalformedInput(CodingErrorAction.REPORT);
1150 		d.onUnmappableCharacter(CodingErrorAction.REPORT);
1151 		return d.decode(b).toString();
1152 	}
1153 
1154 	/**
1155 	 * Locate the position of the commit message body.
1156 	 *
1157 	 * @param b
1158 	 *            buffer to scan.
1159 	 * @param ptr
1160 	 *            position in buffer to start the scan at. Most callers should
1161 	 *            pass 0 to ensure the scan starts from the beginning of the
1162 	 *            commit buffer.
1163 	 * @return position of the user's message buffer.
1164 	 */
1165 	public static final int commitMessage(final byte[] b, int ptr) {
1166 		final int sz = b.length;
1167 		if (ptr == 0)
1168 			ptr += 46; // skip the "tree ..." line.
1169 		while (ptr < sz && b[ptr] == 'p')
1170 			ptr += 48; // skip this parent.
1171 
1172 		// Skip any remaining header lines, ignoring what their actual
1173 		// header line type is. This is identical to the logic for a tag.
1174 		//
1175 		return tagMessage(b, ptr);
1176 	}
1177 
1178 	/**
1179 	 * Locate the position of the tag message body.
1180 	 *
1181 	 * @param b
1182 	 *            buffer to scan.
1183 	 * @param ptr
1184 	 *            position in buffer to start the scan at. Most callers should
1185 	 *            pass 0 to ensure the scan starts from the beginning of the tag
1186 	 *            buffer.
1187 	 * @return position of the user's message buffer.
1188 	 */
1189 	public static final int tagMessage(final byte[] b, int ptr) {
1190 		final int sz = b.length;
1191 		if (ptr == 0)
1192 			ptr += 48; // skip the "object ..." line.
1193 		while (ptr < sz && b[ptr] != '\n')
1194 			ptr = nextLF(b, ptr);
1195 		if (ptr < sz && b[ptr] == '\n')
1196 			return ptr + 1;
1197 		return -1;
1198 	}
1199 
1200 	/**
1201 	 * Locate the end of a paragraph.
1202 	 * <p>
1203 	 * A paragraph is ended by two consecutive LF bytes or CRLF pairs
1204 	 *
1205 	 * @param b
1206 	 *            buffer to scan.
1207 	 * @param start
1208 	 *            position in buffer to start the scan at. Most callers will
1209 	 *            want to pass the first position of the commit message (as
1210 	 *            found by {@link #commitMessage(byte[], int)}.
1211 	 * @return position of the LF at the end of the paragraph;
1212 	 *         <code>b.length</code> if no paragraph end could be located.
1213 	 */
1214 	public static final int endOfParagraph(final byte[] b, final int start) {
1215 		int ptr = start;
1216 		final int sz = b.length;
1217 		while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
1218 			ptr = nextLF(b, ptr);
1219 		if (ptr > start && b[ptr - 1] == '\n')
1220 			ptr--;
1221 		if (ptr > start && b[ptr - 1] == '\r')
1222 			ptr--;
1223 		return ptr;
1224 	}
1225 
1226 	/**
1227 	 * Get last index of {@code ch} in raw, trimming spaces.
1228 	 *
1229 	 * @param raw
1230 	 *            buffer to scan.
1231 	 * @param ch
1232 	 *            character to find.
1233 	 * @param pos
1234 	 *            starting position.
1235 	 * @return last index of {@code ch} in raw, trimming spaces.
1236 	 * @since 4.1
1237 	 */
1238 	public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
1239 		while (pos >= 0 && raw[pos] == ' ')
1240 			pos--;
1241 
1242 		while (pos >= 0 && raw[pos] != ch)
1243 			pos--;
1244 
1245 		return pos;
1246 	}
1247 
	/**
	 * Look up a character set by one of its alias names.
	 * <p>
	 * The name is converted with {@code StringUtils.toLowerCase} before it is
	 * used as the key into {@code encodingAliases}.
	 *
	 * @param name
	 *            alias name to look up.
	 * @return the value {@code encodingAliases} holds for the lower-cased
	 *         name; presumably null when the alias is unknown - verify
	 *         against the declaration of {@code encodingAliases}.
	 */
	private static Charset charsetForAlias(String name) {
		return encodingAliases.get(StringUtils.toLowerCase(name));
	}
1251 
	/**
	 * Private constructor, this class is not meant to be instantiated.
	 */
	private RawParseUtils() {
		// Don't create instances of a static only utility.
	}
1255 }