View Javadoc
1   /*
2    * Copyright (C) 2008-2009, Google Inc.
3    * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org>
4    * and other copyright owners as documented in the project's IP log.
5    *
6    * This program and the accompanying materials are made available
7    * under the terms of the Eclipse Distribution License v1.0 which
8    * accompanies this distribution, is reproduced below, and is
9    * available at http://www.eclipse.org/org/documents/edl-v10.php
10   *
11   * All rights reserved.
12   *
13   * Redistribution and use in source and binary forms, with or
14   * without modification, are permitted provided that the following
15   * conditions are met:
16   *
17   * - Redistributions of source code must retain the above copyright
18   *   notice, this list of conditions and the following disclaimer.
19   *
20   * - Redistributions in binary form must reproduce the above
21   *   copyright notice, this list of conditions and the following
22   *   disclaimer in the documentation and/or other materials provided
23   *   with the distribution.
24   *
25   * - Neither the name of the Eclipse Foundation, Inc. nor the
26   *   names of its contributors may be used to endorse or promote
27   *   products derived from this software without specific prior
28   *   written permission.
29   *
30   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
31   * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
32   * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
33   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
34   * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
35   * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
36   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
37   * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
38   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
39   * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
40   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
41   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
42   * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
43   */
44  
45  package org.eclipse.jgit.util;
46  
47  import static org.eclipse.jgit.lib.ObjectChecker.author;
48  import static org.eclipse.jgit.lib.ObjectChecker.committer;
49  import static org.eclipse.jgit.lib.ObjectChecker.encoding;
50  import static org.eclipse.jgit.lib.ObjectChecker.tagger;
51  
52  import java.nio.ByteBuffer;
53  import java.nio.charset.CharacterCodingException;
54  import java.nio.charset.Charset;
55  import java.nio.charset.CharsetDecoder;
56  import java.nio.charset.CodingErrorAction;
57  import java.nio.charset.IllegalCharsetNameException;
58  import java.nio.charset.UnsupportedCharsetException;
59  import java.util.Arrays;
60  import java.util.HashMap;
61  import java.util.Map;
62  
63  import org.eclipse.jgit.lib.Constants;
64  import org.eclipse.jgit.lib.PersonIdent;
65  
66  /** Handy utility functions to parse raw object contents. */
67  public final class RawParseUtils {
68  	/**
69  	 * UTF-8 charset constant.
70  	 *
71  	 * @since 2.2
72  	 */
73  	public static final Charset UTF8_CHARSET = Charset.forName("UTF-8"); //$NON-NLS-1$
74  
75  	private static final byte[] digits10;
76  
77  	private static final byte[] digits16;
78  
79  	private static final byte[] footerLineKeyChars;
80  
81  	private static final Map<String, Charset> encodingAliases;
82  
83  	static {
84  		encodingAliases = new HashMap<String, Charset>();
85  		encodingAliases.put("latin-1", Charset.forName("ISO-8859-1")); //$NON-NLS-1$ //$NON-NLS-2$
86  
87  		digits10 = new byte['9' + 1];
88  		Arrays.fill(digits10, (byte) -1);
89  		for (char i = '0'; i <= '9'; i++)
90  			digits10[i] = (byte) (i - '0');
91  
92  		digits16 = new byte['f' + 1];
93  		Arrays.fill(digits16, (byte) -1);
94  		for (char i = '0'; i <= '9'; i++)
95  			digits16[i] = (byte) (i - '0');
96  		for (char i = 'a'; i <= 'f'; i++)
97  			digits16[i] = (byte) ((i - 'a') + 10);
98  		for (char i = 'A'; i <= 'F'; i++)
99  			digits16[i] = (byte) ((i - 'A') + 10);
100 
101 		footerLineKeyChars = new byte['z' + 1];
102 		footerLineKeyChars['-'] = 1;
103 		for (char i = '0'; i <= '9'; i++)
104 			footerLineKeyChars[i] = 1;
105 		for (char i = 'A'; i <= 'Z'; i++)
106 			footerLineKeyChars[i] = 1;
107 		for (char i = 'a'; i <= 'z'; i++)
108 			footerLineKeyChars[i] = 1;
109 	}
110 
111 	/**
112 	 * Determine if b[ptr] matches src.
113 	 *
114 	 * @param b
115 	 *            the buffer to scan.
116 	 * @param ptr
117 	 *            first position within b, this should match src[0].
118 	 * @param src
119 	 *            the buffer to test for equality with b.
120 	 * @return ptr + src.length if b[ptr..src.length] == src; else -1.
121 	 */
122 	public static final int match(final byte[] b, int ptr, final byte[] src) {
123 		if (ptr + src.length > b.length)
124 			return -1;
125 		for (int i = 0; i < src.length; i++, ptr++)
126 			if (b[ptr] != src[i])
127 				return -1;
128 		return ptr;
129 	}
130 
131 	private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5',
132 			'6', '7', '8', '9' };
133 
134 	/**
135 	 * Format a base 10 numeric into a temporary buffer.
136 	 * <p>
137 	 * Formatting is performed backwards. The method starts at offset
138 	 * <code>o-1</code> and ends at <code>o-1-digits</code>, where
139 	 * <code>digits</code> is the number of positions necessary to store the
140 	 * base 10 value.
141 	 * <p>
142 	 * The argument and return values from this method make it easy to chain
143 	 * writing, for example:
144 	 * </p>
145 	 *
146 	 * <pre>
147 	 * final byte[] tmp = new byte[64];
148 	 * int ptr = tmp.length;
149 	 * tmp[--ptr] = '\n';
150 	 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32);
151 	 * tmp[--ptr] = ' ';
152 	 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18);
153 	 * tmp[--ptr] = 0;
154 	 * final String str = new String(tmp, ptr, tmp.length - ptr);
155 	 * </pre>
156 	 *
157 	 * @param b
158 	 *            buffer to write into.
159 	 * @param o
160 	 *            one offset past the location where writing will begin; writing
161 	 *            proceeds towards lower index values.
162 	 * @param value
163 	 *            the value to store.
164 	 * @return the new offset value <code>o</code>. This is the position of
165 	 *         the last byte written. Additional writing should start at one
166 	 *         position earlier.
167 	 */
168 	public static int formatBase10(final byte[] b, int o, int value) {
169 		if (value == 0) {
170 			b[--o] = '0';
171 			return o;
172 		}
173 		final boolean isneg = value < 0;
174 		if (isneg)
175 			value = -value;
176 		while (value != 0) {
177 			b[--o] = base10byte[value % 10];
178 			value /= 10;
179 		}
180 		if (isneg)
181 			b[--o] = '-';
182 		return o;
183 	}
184 
185 	/**
186 	 * Parse a base 10 numeric from a sequence of ASCII digits into an int.
187 	 * <p>
188 	 * Digit sequences can begin with an optional run of spaces before the
189 	 * sequence, and may start with a '+' or a '-' to indicate sign position.
190 	 * Any other characters will cause the method to stop and return the current
191 	 * result to the caller.
192 	 *
193 	 * @param b
194 	 *            buffer to scan.
195 	 * @param ptr
196 	 *            position within buffer to start parsing digits at.
197 	 * @param ptrResult
198 	 *            optional location to return the new ptr value through. If null
199 	 *            the ptr value will be discarded.
200 	 * @return the value at this location; 0 if the location is not a valid
201 	 *         numeric.
202 	 */
203 	public static final int parseBase10(final byte[] b, int ptr,
204 			final MutableInteger ptrResult) {
205 		int r = 0;
206 		int sign = 0;
207 		try {
208 			final int sz = b.length;
209 			while (ptr < sz && b[ptr] == ' ')
210 				ptr++;
211 			if (ptr >= sz)
212 				return 0;
213 
214 			switch (b[ptr]) {
215 			case '-':
216 				sign = -1;
217 				ptr++;
218 				break;
219 			case '+':
220 				ptr++;
221 				break;
222 			}
223 
224 			while (ptr < sz) {
225 				final byte v = digits10[b[ptr]];
226 				if (v < 0)
227 					break;
228 				r = (r * 10) + v;
229 				ptr++;
230 			}
231 		} catch (ArrayIndexOutOfBoundsException e) {
232 			// Not a valid digit.
233 		}
234 		if (ptrResult != null)
235 			ptrResult.value = ptr;
236 		return sign < 0 ? -r : r;
237 	}
238 
239 	/**
240 	 * Parse a base 10 numeric from a sequence of ASCII digits into a long.
241 	 * <p>
242 	 * Digit sequences can begin with an optional run of spaces before the
243 	 * sequence, and may start with a '+' or a '-' to indicate sign position.
244 	 * Any other characters will cause the method to stop and return the current
245 	 * result to the caller.
246 	 *
247 	 * @param b
248 	 *            buffer to scan.
249 	 * @param ptr
250 	 *            position within buffer to start parsing digits at.
251 	 * @param ptrResult
252 	 *            optional location to return the new ptr value through. If null
253 	 *            the ptr value will be discarded.
254 	 * @return the value at this location; 0 if the location is not a valid
255 	 *         numeric.
256 	 */
257 	public static final long parseLongBase10(final byte[] b, int ptr,
258 			final MutableInteger ptrResult) {
259 		long r = 0;
260 		int sign = 0;
261 		try {
262 			final int sz = b.length;
263 			while (ptr < sz && b[ptr] == ' ')
264 				ptr++;
265 			if (ptr >= sz)
266 				return 0;
267 
268 			switch (b[ptr]) {
269 			case '-':
270 				sign = -1;
271 				ptr++;
272 				break;
273 			case '+':
274 				ptr++;
275 				break;
276 			}
277 
278 			while (ptr < sz) {
279 				final byte v = digits10[b[ptr]];
280 				if (v < 0)
281 					break;
282 				r = (r * 10) + v;
283 				ptr++;
284 			}
285 		} catch (ArrayIndexOutOfBoundsException e) {
286 			// Not a valid digit.
287 		}
288 		if (ptrResult != null)
289 			ptrResult.value = ptr;
290 		return sign < 0 ? -r : r;
291 	}
292 
293 	/**
294 	 * Parse 4 character base 16 (hex) formatted string to unsigned integer.
295 	 * <p>
296 	 * The number is read in network byte order, that is, most significant
297 	 * nybble first.
298 	 *
299 	 * @param bs
300 	 *            buffer to parse digits from; positions {@code [p, p+4)} will
301 	 *            be parsed.
302 	 * @param p
303 	 *            first position within the buffer to parse.
304 	 * @return the integer value.
305 	 * @throws ArrayIndexOutOfBoundsException
306 	 *             if the string is not hex formatted.
307 	 */
308 	public static final int parseHexInt16(final byte[] bs, final int p) {
309 		int r = digits16[bs[p]] << 4;
310 
311 		r |= digits16[bs[p + 1]];
312 		r <<= 4;
313 
314 		r |= digits16[bs[p + 2]];
315 		r <<= 4;
316 
317 		r |= digits16[bs[p + 3]];
318 		if (r < 0)
319 			throw new ArrayIndexOutOfBoundsException();
320 		return r;
321 	}
322 
323 	/**
324 	 * Parse 8 character base 16 (hex) formatted string to unsigned integer.
325 	 * <p>
326 	 * The number is read in network byte order, that is, most significant
327 	 * nybble first.
328 	 *
329 	 * @param bs
330 	 *            buffer to parse digits from; positions {@code [p, p+8)} will
331 	 *            be parsed.
332 	 * @param p
333 	 *            first position within the buffer to parse.
334 	 * @return the integer value.
335 	 * @throws ArrayIndexOutOfBoundsException
336 	 *             if the string is not hex formatted.
337 	 */
338 	public static final int parseHexInt32(final byte[] bs, final int p) {
339 		int r = digits16[bs[p]] << 4;
340 
341 		r |= digits16[bs[p + 1]];
342 		r <<= 4;
343 
344 		r |= digits16[bs[p + 2]];
345 		r <<= 4;
346 
347 		r |= digits16[bs[p + 3]];
348 		r <<= 4;
349 
350 		r |= digits16[bs[p + 4]];
351 		r <<= 4;
352 
353 		r |= digits16[bs[p + 5]];
354 		r <<= 4;
355 
356 		r |= digits16[bs[p + 6]];
357 
358 		final int last = digits16[bs[p + 7]];
359 		if (r < 0 || last < 0)
360 			throw new ArrayIndexOutOfBoundsException();
361 		return (r << 4) | last;
362 	}
363 
364 	/**
365 	 * Parse a single hex digit to its numeric value (0-15).
366 	 *
367 	 * @param digit
368 	 *            hex character to parse.
369 	 * @return numeric value, in the range 0-15.
370 	 * @throws ArrayIndexOutOfBoundsException
371 	 *             if the input digit is not a valid hex digit.
372 	 */
373 	public static final int parseHexInt4(final byte digit) {
374 		final byte r = digits16[digit];
375 		if (r < 0)
376 			throw new ArrayIndexOutOfBoundsException();
377 		return r;
378 	}
379 
380 	/**
381 	 * Parse a Git style timezone string.
382 	 * <p>
383 	 * The sequence "-0315" will be parsed as the numeric value -195, as the
384 	 * lower two positions count minutes, not 100ths of an hour.
385 	 *
386 	 * @param b
387 	 *            buffer to scan.
388 	 * @param ptr
389 	 *            position within buffer to start parsing digits at.
390 	 * @return the timezone at this location, expressed in minutes.
391 	 */
392 	public static final int parseTimeZoneOffset(final byte[] b, int ptr) {
393 		return parseTimeZoneOffset(b, ptr, null);
394 	}
395 
396 	/**
397 	 * Parse a Git style timezone string.
398 	 * <p>
399 	 * The sequence "-0315" will be parsed as the numeric value -195, as the
400 	 * lower two positions count minutes, not 100ths of an hour.
401 	 *
402 	 * @param b
403 	 *            buffer to scan.
404 	 * @param ptr
405 	 *            position within buffer to start parsing digits at.
406 	 * @param ptrResult
407 	 *            optional location to return the new ptr value through. If null
408 	 *            the ptr value will be discarded.
409 	 * @return the timezone at this location, expressed in minutes.
410 	 * @since 4.1
411 	 */
412 	public static final int parseTimeZoneOffset(final byte[] b, int ptr,
413 			MutableInteger ptrResult) {
414 		final int v = parseBase10(b, ptr, ptrResult);
415 		final int tzMins = v % 100;
416 		final int tzHours = v / 100;
417 		return tzHours * 60 + tzMins;
418 	}
419 
420 	/**
421 	 * Locate the first position after a given character.
422 	 *
423 	 * @param b
424 	 *            buffer to scan.
425 	 * @param ptr
426 	 *            position within buffer to start looking for chrA at.
427 	 * @param chrA
428 	 *            character to find.
429 	 * @return new position just after chrA.
430 	 */
431 	public static final int next(final byte[] b, int ptr, final char chrA) {
432 		final int sz = b.length;
433 		while (ptr < sz) {
434 			if (b[ptr++] == chrA)
435 				return ptr;
436 		}
437 		return ptr;
438 	}
439 
440 	/**
441 	 * Locate the first position after the next LF.
442 	 * <p>
443 	 * This method stops on the first '\n' it finds.
444 	 *
445 	 * @param b
446 	 *            buffer to scan.
447 	 * @param ptr
448 	 *            position within buffer to start looking for LF at.
449 	 * @return new position just after the first LF found.
450 	 */
451 	public static final int nextLF(final byte[] b, int ptr) {
452 		return next(b, ptr, '\n');
453 	}
454 
455 	/**
456 	 * Locate the first position after either the given character or LF.
457 	 * <p>
458 	 * This method stops on the first match it finds from either chrA or '\n'.
459 	 *
460 	 * @param b
461 	 *            buffer to scan.
462 	 * @param ptr
463 	 *            position within buffer to start looking for chrA or LF at.
464 	 * @param chrA
465 	 *            character to find.
466 	 * @return new position just after the first chrA or LF to be found.
467 	 */
468 	public static final int nextLF(final byte[] b, int ptr, final char chrA) {
469 		final int sz = b.length;
470 		while (ptr < sz) {
471 			final byte c = b[ptr++];
472 			if (c == chrA || c == '\n')
473 				return ptr;
474 		}
475 		return ptr;
476 	}
477 
478 	/**
479 	 * Locate the first position before a given character.
480 	 *
481 	 * @param b
482 	 *            buffer to scan.
483 	 * @param ptr
484 	 *            position within buffer to start looking for chrA at.
485 	 * @param chrA
486 	 *            character to find.
487 	 * @return new position just before chrA, -1 for not found
488 	 */
489 	public static final int prev(final byte[] b, int ptr, final char chrA) {
490 		if (ptr == b.length)
491 			--ptr;
492 		while (ptr >= 0) {
493 			if (b[ptr--] == chrA)
494 				return ptr;
495 		}
496 		return ptr;
497 	}
498 
499 	/**
500 	 * Locate the first position before the previous LF.
501 	 * <p>
502 	 * This method stops on the first '\n' it finds.
503 	 *
504 	 * @param b
505 	 *            buffer to scan.
506 	 * @param ptr
507 	 *            position within buffer to start looking for LF at.
508 	 * @return new position just before the first LF found, -1 for not found
509 	 */
510 	public static final int prevLF(final byte[] b, int ptr) {
511 		return prev(b, ptr, '\n');
512 	}
513 
514 	/**
515 	 * Locate the previous position before either the given character or LF.
516 	 * <p>
517 	 * This method stops on the first match it finds from either chrA or '\n'.
518 	 *
519 	 * @param b
520 	 *            buffer to scan.
521 	 * @param ptr
522 	 *            position within buffer to start looking for chrA or LF at.
523 	 * @param chrA
524 	 *            character to find.
525 	 * @return new position just before the first chrA or LF to be found, -1 for
526 	 *         not found
527 	 */
528 	public static final int prevLF(final byte[] b, int ptr, final char chrA) {
529 		if (ptr == b.length)
530 			--ptr;
531 		while (ptr >= 0) {
532 			final byte c = b[ptr--];
533 			if (c == chrA || c == '\n')
534 				return ptr;
535 		}
536 		return ptr;
537 	}
538 
539 	/**
540 	 * Index the region between <code>[ptr, end)</code> to find line starts.
541 	 * <p>
542 	 * The returned list is 1 indexed. Index 0 contains
543 	 * {@link Integer#MIN_VALUE} to pad the list out.
544 	 * <p>
545 	 * Using a 1 indexed list means that line numbers can be directly accessed
546 	 * from the list, so <code>list.get(1)</code> (aka get line 1) returns
547 	 * <code>ptr</code>.
548 	 * <p>
549 	 * The last element (index <code>map.size()-1</code>) always contains
550 	 * <code>end</code>.
551 	 *
552 	 * @param buf
553 	 *            buffer to scan.
554 	 * @param ptr
555 	 *            position within the buffer corresponding to the first byte of
556 	 *            line 1.
557 	 * @param end
558 	 *            1 past the end of the content within <code>buf</code>.
559 	 * @return a line map indexing the start position of each line.
560 	 */
561 	public static final IntList lineMap(final byte[] buf, int ptr, int end) {
562 		// Experimentally derived from multiple source repositories
563 		// the average number of bytes/line is 36. Its a rough guess
564 		// to initially size our map close to the target.
565 		//
566 		final IntList map = new IntList((end - ptr) / 36);
567 		map.fillTo(1, Integer.MIN_VALUE);
568 		for (; ptr < end; ptr = nextLF(buf, ptr))
569 			map.add(ptr);
570 		map.add(end);
571 		return map;
572 	}
573 
574 	/**
575 	 * Locate the "author " header line data.
576 	 *
577 	 * @param b
578 	 *            buffer to scan.
579 	 * @param ptr
580 	 *            position in buffer to start the scan at. Most callers should
581 	 *            pass 0 to ensure the scan starts from the beginning of the
582 	 *            commit buffer and does not accidentally look at message body.
583 	 * @return position just after the space in "author ", so the first
584 	 *         character of the author's name. If no author header can be
585 	 *         located -1 is returned.
586 	 */
587 	public static final int author(final byte[] b, int ptr) {
588 		final int sz = b.length;
589 		if (ptr == 0)
590 			ptr += 46; // skip the "tree ..." line.
591 		while (ptr < sz && b[ptr] == 'p')
592 			ptr += 48; // skip this parent.
593 		return match(b, ptr, author);
594 	}
595 
596 	/**
597 	 * Locate the "committer " header line data.
598 	 *
599 	 * @param b
600 	 *            buffer to scan.
601 	 * @param ptr
602 	 *            position in buffer to start the scan at. Most callers should
603 	 *            pass 0 to ensure the scan starts from the beginning of the
604 	 *            commit buffer and does not accidentally look at message body.
605 	 * @return position just after the space in "committer ", so the first
606 	 *         character of the committer's name. If no committer header can be
607 	 *         located -1 is returned.
608 	 */
609 	public static final int committer(final byte[] b, int ptr) {
610 		final int sz = b.length;
611 		if (ptr == 0)
612 			ptr += 46; // skip the "tree ..." line.
613 		while (ptr < sz && b[ptr] == 'p')
614 			ptr += 48; // skip this parent.
615 		if (ptr < sz && b[ptr] == 'a')
616 			ptr = nextLF(b, ptr);
617 		return match(b, ptr, committer);
618 	}
619 
620 	/**
621 	 * Locate the "tagger " header line data.
622 	 *
623 	 * @param b
624 	 *            buffer to scan.
625 	 * @param ptr
626 	 *            position in buffer to start the scan at. Most callers should
627 	 *            pass 0 to ensure the scan starts from the beginning of the tag
628 	 *            buffer and does not accidentally look at message body.
629 	 * @return position just after the space in "tagger ", so the first
630 	 *         character of the tagger's name. If no tagger header can be
631 	 *         located -1 is returned.
632 	 */
633 	public static final int tagger(final byte[] b, int ptr) {
634 		final int sz = b.length;
635 		if (ptr == 0)
636 			ptr += 48; // skip the "object ..." line.
637 		while (ptr < sz) {
638 			if (b[ptr] == '\n')
639 				return -1;
640 			final int m = match(b, ptr, tagger);
641 			if (m >= 0)
642 				return m;
643 			ptr = nextLF(b, ptr);
644 		}
645 		return -1;
646 	}
647 
648 	/**
649 	 * Locate the "encoding " header line.
650 	 *
651 	 * @param b
652 	 *            buffer to scan.
653 	 * @param ptr
654 	 *            position in buffer to start the scan at. Most callers should
655 	 *            pass 0 to ensure the scan starts from the beginning of the
656 	 *            buffer and does not accidentally look at the message body.
657 	 * @return position just after the space in "encoding ", so the first
658 	 *         character of the encoding's name. If no encoding header can be
659 	 *         located -1 is returned (and UTF-8 should be assumed).
660 	 */
661 	public static final int encoding(final byte[] b, int ptr) {
662 		final int sz = b.length;
663 		while (ptr < sz) {
664 			if (b[ptr] == '\n')
665 				return -1;
666 			if (b[ptr] == 'e')
667 				break;
668 			ptr = nextLF(b, ptr);
669 		}
670 		return match(b, ptr, encoding);
671 	}
672 
673 	/**
674 	 * Parse the "encoding " header into a character set reference.
675 	 * <p>
676 	 * Locates the "encoding " header (if present) by first calling
677 	 * {@link #encoding(byte[], int)} and then returns the proper character set
678 	 * to apply to this buffer to evaluate its contents as character data.
679 	 * <p>
680 	 * If no encoding header is present, {@link Constants#CHARSET} is assumed.
681 	 *
682 	 * @param b
683 	 *            buffer to scan.
684 	 * @return the Java character set representation. Never null.
685 	 */
686 	public static Charset parseEncoding(final byte[] b) {
687 		final int enc = encoding(b, 0);
688 		if (enc < 0)
689 			return Constants.CHARSET;
690 		final int lf = nextLF(b, enc);
691 		String decoded = decode(Constants.CHARSET, b, enc, lf - 1);
692 		try {
693 			return Charset.forName(decoded);
694 		} catch (IllegalCharsetNameException badName) {
695 			Charset aliased = charsetForAlias(decoded);
696 			if (aliased != null)
697 				return aliased;
698 			throw badName;
699 		} catch (UnsupportedCharsetException badName) {
700 			Charset aliased = charsetForAlias(decoded);
701 			if (aliased != null)
702 				return aliased;
703 			throw badName;
704 		}
705 	}
706 
707 	/**
708 	 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent.
709 	 * <p>
710 	 * Leading spaces won't be trimmed from the string, i.e. will show up in the
711 	 * parsed name afterwards.
712 	 *
713 	 * @param in
714 	 *            the string to parse a name from.
715 	 * @return the parsed identity or null in case the identity could not be
716 	 *         parsed.
717 	 */
718 	public static PersonIdent parsePersonIdent(final String in) {
719 		return parsePersonIdent(Constants.encode(in), 0);
720 	}
721 
722 	/**
723 	 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent.
724 	 * <p>
725 	 * When passing in a value for <code>nameB</code> callers should use the
726 	 * return value of {@link #author(byte[], int)} or
727 	 * {@link #committer(byte[], int)}, as these methods provide the proper
728 	 * position within the buffer.
729 	 *
730 	 * @param raw
731 	 *            the buffer to parse character data from.
732 	 * @param nameB
733 	 *            first position of the identity information. This should be the
734 	 *            first position after the space which delimits the header field
735 	 *            name (e.g. "author" or "committer") from the rest of the
736 	 *            identity line.
737 	 * @return the parsed identity or null in case the identity could not be
738 	 *         parsed.
739 	 */
740 	public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) {
741 		final Charset cs = parseEncoding(raw);
742 		final int emailB = nextLF(raw, nameB, '<');
743 		final int emailE = nextLF(raw, emailB, '>');
744 		if (emailB >= raw.length || raw[emailB] == '\n' ||
745 				(emailE >= raw.length - 1 && raw[emailE - 1] != '>'))
746 			return null;
747 
748 		final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ?
749 				emailB - 2 : emailB - 1;
750 		final String name = decode(cs, raw, nameB, nameEnd);
751 		final String email = decode(cs, raw, emailB, emailE - 1);
752 
753 		// Start searching from end of line, as after first name-email pair,
754 		// another name-email pair may occur. We will ignore all kinds of
755 		// "junk" following the first email.
756 		//
757 		// We've to use (emailE - 1) for the case that raw[email] is LF,
758 		// otherwise we would run too far. "-2" is necessary to position
759 		// before the LF in case of LF termination resp. the penultimate
760 		// character if there is no trailing LF.
761 		final int tzBegin = lastIndexOfTrim(raw, ' ',
762 				nextLF(raw, emailE - 1) - 2) + 1;
763 		if (tzBegin <= emailE) // No time/zone, still valid
764 			return new PersonIdent(name, email, 0, 0);
765 
766 		final int whenBegin = Math.max(emailE,
767 				lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1);
768 		if (whenBegin >= tzBegin - 1) // No time/zone, still valid
769 			return new PersonIdent(name, email, 0, 0);
770 
771 		final long when = parseLongBase10(raw, whenBegin, null);
772 		final int tz = parseTimeZoneOffset(raw, tzBegin);
773 		return new PersonIdent(name, email, when * 1000L, tz);
774 	}
775 
776 	/**
777 	 * Parse a name data (e.g. as within a reflog) into a PersonIdent.
778 	 * <p>
779 	 * When passing in a value for <code>nameB</code> callers should use the
780 	 * return value of {@link #author(byte[], int)} or
781 	 * {@link #committer(byte[], int)}, as these methods provide the proper
782 	 * position within the buffer.
783 	 *
784 	 * @param raw
785 	 *            the buffer to parse character data from.
786 	 * @param nameB
787 	 *            first position of the identity information. This should be the
788 	 *            first position after the space which delimits the header field
789 	 *            name (e.g. "author" or "committer") from the rest of the
790 	 *            identity line.
791 	 * @return the parsed identity. Never null.
792 	 */
793 	public static PersonIdent parsePersonIdentOnly(final byte[] raw,
794 			final int nameB) {
795 		int stop = nextLF(raw, nameB);
796 		int emailB = nextLF(raw, nameB, '<');
797 		int emailE = nextLF(raw, emailB, '>');
798 		final String name;
799 		final String email;
800 		if (emailE < stop) {
801 			email = decode(raw, emailB, emailE - 1);
802 		} else {
803 			email = "invalid"; //$NON-NLS-1$
804 		}
805 		if (emailB < stop)
806 			name = decode(raw, nameB, emailB - 2);
807 		else
808 			name = decode(raw, nameB, stop);
809 
810 		final MutableInteger ptrout = new MutableInteger();
811 		long when;
812 		int tz;
813 		if (emailE < stop) {
814 			when = parseLongBase10(raw, emailE + 1, ptrout);
815 			tz = parseTimeZoneOffset(raw, ptrout.value);
816 		} else {
817 			when = 0;
818 			tz = 0;
819 		}
820 		return new PersonIdent(name, email, when * 1000L, tz);
821 	}
822 
823 	/**
824 	 * Locate the end of a footer line key string.
825 	 * <p>
826 	 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g.
827 	 * "Signed-off-by: A. U. Thor\n") then this method returns the position of
828 	 * the first ':'.
829 	 * <p>
830 	 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:}
831 	 * then this method returns -1.
832 	 *
833 	 * @param raw
834 	 *            buffer to scan.
835 	 * @param ptr
836 	 *            first position within raw to consider as a footer line key.
837 	 * @return position of the ':' which terminates the footer line key if this
838 	 *         is otherwise a valid footer line key; otherwise -1.
839 	 */
840 	public static int endOfFooterLineKey(final byte[] raw, int ptr) {
841 		try {
842 			for (;;) {
843 				final byte c = raw[ptr];
844 				if (footerLineKeyChars[c] == 0) {
845 					if (c == ':')
846 						return ptr;
847 					return -1;
848 				}
849 				ptr++;
850 			}
851 		} catch (ArrayIndexOutOfBoundsException e) {
852 			return -1;
853 		}
854 	}
855 
856 	/**
857 	 * Decode a buffer under UTF-8, if possible.
858 	 *
859 	 * If the byte stream cannot be decoded that way, the platform default is tried
860 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
861 	 *
862 	 * @param buffer
863 	 *            buffer to pull raw bytes from.
	 * @return a string representation of the whole buffer, after decoding it
	 *         as described above.
866 	 */
867 	public static String decode(final byte[] buffer) {
868 		return decode(buffer, 0, buffer.length);
869 	}
870 
871 	/**
872 	 * Decode a buffer under UTF-8, if possible.
873 	 *
874 	 * If the byte stream cannot be decoded that way, the platform default is
875 	 * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
876 	 *
877 	 * @param buffer
878 	 *            buffer to pull raw bytes from.
879 	 * @param start
880 	 *            start position in buffer
881 	 * @param end
882 	 *            one position past the last location within the buffer to take
883 	 *            data from.
884 	 * @return a string representation of the range <code>[start,end)</code>,
885 	 *         after decoding the region through the specified character set.
886 	 */
887 	public static String decode(final byte[] buffer, final int start,
888 			final int end) {
889 		return decode(Constants.CHARSET, buffer, start, end);
890 	}
891 
892 	/**
893 	 * Decode a buffer under the specified character set if possible.
894 	 *
895 	 * If the byte stream cannot be decoded that way, the platform default is tried
896 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
897 	 *
898 	 * @param cs
899 	 *            character set to use when decoding the buffer.
900 	 * @param buffer
901 	 *            buffer to pull raw bytes from.
	 * @return a string representation of the whole buffer, after decoding it
	 *         through the specified character set.
904 	 */
905 	public static String decode(final Charset cs, final byte[] buffer) {
906 		return decode(cs, buffer, 0, buffer.length);
907 	}
908 
909 	/**
910 	 * Decode a region of the buffer under the specified character set if possible.
911 	 *
912 	 * If the byte stream cannot be decoded that way, the platform default is tried
913 	 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried.
914 	 *
915 	 * @param cs
916 	 *            character set to use when decoding the buffer.
917 	 * @param buffer
918 	 *            buffer to pull raw bytes from.
919 	 * @param start
920 	 *            first position within the buffer to take data from.
921 	 * @param end
922 	 *            one position past the last location within the buffer to take
923 	 *            data from.
924 	 * @return a string representation of the range <code>[start,end)</code>,
925 	 *         after decoding the region through the specified character set.
926 	 */
927 	public static String decode(final Charset cs, final byte[] buffer,
928 			final int start, final int end) {
929 		try {
930 			return decodeNoFallback(cs, buffer, start, end);
931 		} catch (CharacterCodingException e) {
932 			// Fall back to an ISO-8859-1 style encoding. At least all of
933 			// the bytes will be present in the output.
934 			//
935 			return extractBinaryString(buffer, start, end);
936 		}
937 	}
938 
939 	/**
940 	 * Decode a region of the buffer under the specified character set if
941 	 * possible.
942 	 *
943 	 * If the byte stream cannot be decoded that way, the platform default is
944 	 * tried and if that too fails, an exception is thrown.
945 	 *
946 	 * @param cs
947 	 *            character set to use when decoding the buffer.
948 	 * @param buffer
949 	 *            buffer to pull raw bytes from.
950 	 * @param start
951 	 *            first position within the buffer to take data from.
952 	 * @param end
953 	 *            one position past the last location within the buffer to take
954 	 *            data from.
955 	 * @return a string representation of the range <code>[start,end)</code>,
956 	 *         after decoding the region through the specified character set.
957 	 * @throws CharacterCodingException
958 	 *             the input is not in any of the tested character sets.
959 	 */
960 	public static String decodeNoFallback(final Charset cs,
961 			final byte[] buffer, final int start, final int end)
962 			throws CharacterCodingException {
963 		final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start);
964 		b.mark();
965 
966 		// Try our built-in favorite. The assumption here is that
967 		// decoding will fail if the data is not actually encoded
968 		// using that encoder.
969 		//
970 		try {
971 			return decode(b, Constants.CHARSET);
972 		} catch (CharacterCodingException e) {
973 			b.reset();
974 		}
975 
976 		if (!cs.equals(Constants.CHARSET)) {
977 			// Try the suggested encoding, it might be right since it was
978 			// provided by the caller.
979 			//
980 			try {
981 				return decode(b, cs);
982 			} catch (CharacterCodingException e) {
983 				b.reset();
984 			}
985 		}
986 
987 		// Try the default character set. A small group of people
988 		// might actually use the same (or very similar) locale.
989 		//
990 		final Charset defcs = Charset.defaultCharset();
991 		if (!defcs.equals(cs) && !defcs.equals(Constants.CHARSET)) {
992 			try {
993 				return decode(b, defcs);
994 			} catch (CharacterCodingException e) {
995 				b.reset();
996 			}
997 		}
998 
999 		throw new CharacterCodingException();
1000 	}
1001 
1002 	/**
1003 	 * Decode a region of the buffer under the ISO-8859-1 encoding.
1004 	 *
1005 	 * Each byte is treated as a single character in the 8859-1 character
1006 	 * encoding, performing a raw binary-&gt;char conversion.
1007 	 *
1008 	 * @param buffer
1009 	 *            buffer to pull raw bytes from.
1010 	 * @param start
1011 	 *            first position within the buffer to take data from.
1012 	 * @param end
1013 	 *            one position past the last location within the buffer to take
1014 	 *            data from.
1015 	 * @return a string representation of the range <code>[start,end)</code>.
1016 	 */
1017 	public static String extractBinaryString(final byte[] buffer,
1018 			final int start, final int end) {
1019 		final StringBuilder r = new StringBuilder(end - start);
1020 		for (int i = start; i < end; i++)
1021 			r.append((char) (buffer[i] & 0xff));
1022 		return r.toString();
1023 	}
1024 
1025 	private static String decode(final ByteBuffer b, final Charset charset)
1026 			throws CharacterCodingException {
1027 		final CharsetDecoder d = charset.newDecoder();
1028 		d.onMalformedInput(CodingErrorAction.REPORT);
1029 		d.onUnmappableCharacter(CodingErrorAction.REPORT);
1030 		return d.decode(b).toString();
1031 	}
1032 
1033 	/**
1034 	 * Locate the position of the commit message body.
1035 	 *
1036 	 * @param b
1037 	 *            buffer to scan.
1038 	 * @param ptr
1039 	 *            position in buffer to start the scan at. Most callers should
1040 	 *            pass 0 to ensure the scan starts from the beginning of the
1041 	 *            commit buffer.
1042 	 * @return position of the user's message buffer.
1043 	 */
1044 	public static final int commitMessage(final byte[] b, int ptr) {
1045 		final int sz = b.length;
1046 		if (ptr == 0)
1047 			ptr += 46; // skip the "tree ..." line.
1048 		while (ptr < sz && b[ptr] == 'p')
1049 			ptr += 48; // skip this parent.
1050 
1051 		// Skip any remaining header lines, ignoring what their actual
1052 		// header line type is. This is identical to the logic for a tag.
1053 		//
1054 		return tagMessage(b, ptr);
1055 	}
1056 
1057 	/**
1058 	 * Locate the position of the tag message body.
1059 	 *
1060 	 * @param b
1061 	 *            buffer to scan.
1062 	 * @param ptr
1063 	 *            position in buffer to start the scan at. Most callers should
1064 	 *            pass 0 to ensure the scan starts from the beginning of the tag
1065 	 *            buffer.
1066 	 * @return position of the user's message buffer.
1067 	 */
1068 	public static final int tagMessage(final byte[] b, int ptr) {
1069 		final int sz = b.length;
1070 		if (ptr == 0)
1071 			ptr += 48; // skip the "object ..." line.
1072 		while (ptr < sz && b[ptr] != '\n')
1073 			ptr = nextLF(b, ptr);
1074 		if (ptr < sz && b[ptr] == '\n')
1075 			return ptr + 1;
1076 		return -1;
1077 	}
1078 
1079 	/**
1080 	 * Locate the end of a paragraph.
1081 	 * <p>
1082 	 * A paragraph is ended by two consecutive LF bytes or CRLF pairs
1083 	 *
1084 	 * @param b
1085 	 *            buffer to scan.
1086 	 * @param start
1087 	 *            position in buffer to start the scan at. Most callers will
1088 	 *            want to pass the first position of the commit message (as
1089 	 *            found by {@link #commitMessage(byte[], int)}.
1090 	 * @return position of the LF at the end of the paragraph;
1091 	 *         <code>b.length</code> if no paragraph end could be located.
1092 	 */
1093 	public static final int endOfParagraph(final byte[] b, final int start) {
1094 		int ptr = start;
1095 		final int sz = b.length;
1096 		while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r'))
1097 			ptr = nextLF(b, ptr);
1098 		if (ptr > start && b[ptr - 1] == '\n')
1099 			ptr--;
1100 		if (ptr > start && b[ptr - 1] == '\r')
1101 			ptr--;
1102 		return ptr;
1103 	}
1104 
1105 	/**
1106 	 * @param raw
1107 	 *            buffer to scan.
1108 	 * @param ch
1109 	 *            character to find.
1110 	 * @param pos
1111 	 *            starting position.
1112 	 * @return last index of ch in raw, trimming spaces.
1113 	 * @since 4.1
1114 	 */
1115 	public static int lastIndexOfTrim(byte[] raw, char ch, int pos) {
1116 		while (pos >= 0 && raw[pos] == ' ')
1117 			pos--;
1118 
1119 		while (pos >= 0 && raw[pos] != ch)
1120 			pos--;
1121 
1122 		return pos;
1123 	}
1124 
1125 	private static Charset charsetForAlias(String name) {
1126 		return encodingAliases.get(StringUtils.toLowerCase(name));
1127 	}
1128 
1129 	private RawParseUtils() {
1130 		// Don't create instances of a static only utility.
1131 	}
1132 }