1 /* 2 * Copyright (C) 2008-2009, Google Inc. 3 * Copyright (C) 2006-2008, Shawn O. Pearce <spearce@spearce.org> 4 * and other copyright owners as documented in the project's IP log. 5 * 6 * This program and the accompanying materials are made available 7 * under the terms of the Eclipse Distribution License v1.0 which 8 * accompanies this distribution, is reproduced below, and is 9 * available at http://www.eclipse.org/org/documents/edl-v10.php 10 * 11 * All rights reserved. 12 * 13 * Redistribution and use in source and binary forms, with or 14 * without modification, are permitted provided that the following 15 * conditions are met: 16 * 17 * - Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 20 * - Redistributions in binary form must reproduce the above 21 * copyright notice, this list of conditions and the following 22 * disclaimer in the documentation and/or other materials provided 23 * with the distribution. 24 * 25 * - Neither the name of the Eclipse Foundation, Inc. nor the 26 * names of its contributors may be used to endorse or promote 27 * products derived from this software without specific prior 28 * written permission. 29 * 30 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND 31 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, 32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 33 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 35 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 36 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 37 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 38 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 39 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 40 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 41 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 42 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 43 */ 44 45 package org.eclipse.jgit.util; 46 47 import static org.eclipse.jgit.lib.ObjectChecker.author; 48 import static org.eclipse.jgit.lib.ObjectChecker.committer; 49 import static org.eclipse.jgit.lib.ObjectChecker.encoding; 50 import static org.eclipse.jgit.lib.ObjectChecker.tagger; 51 52 import java.nio.ByteBuffer; 53 import java.nio.charset.CharacterCodingException; 54 import java.nio.charset.Charset; 55 import java.nio.charset.CharsetDecoder; 56 import java.nio.charset.CodingErrorAction; 57 import java.nio.charset.IllegalCharsetNameException; 58 import java.nio.charset.UnsupportedCharsetException; 59 import java.util.Arrays; 60 import java.util.HashMap; 61 import java.util.Map; 62 63 import org.eclipse.jgit.lib.Constants; 64 import org.eclipse.jgit.lib.PersonIdent; 65 66 /** Handy utility functions to parse raw object contents. */ 67 public final class RawParseUtils { 68 /** 69 * UTF-8 charset constant. 70 * 71 * @since 2.2 72 */ 73 public static final Charset UTF8_CHARSET = Charset.forName("UTF-8"); //$NON-NLS-1$ 74 75 private static final byte[] digits10; 76 77 private static final byte[] digits16; 78 79 private static final byte[] footerLineKeyChars; 80 81 private static final Map<String, Charset> encodingAliases; 82 83 static { 84 encodingAliases = new HashMap<String, Charset>(); 85 encodingAliases.put("latin-1", Charset.forName("ISO-8859-1")); //$NON-NLS-1$ //$NON-NLS-2$ 86 87 digits10 = new byte['9' + 1]; 88 Arrays.fill(digits10, (byte) -1); 89 for (char i = '0'; i <= '9'; i++) 90 digits10[i] = (byte) (i - '0'); 91 92 digits16 = new byte['f' + 1]; 93 Arrays.fill(digits16, (byte) -1); 94 for (char i = '0'; i <= '9'; i++) 95 digits16[i] = (byte) (i - '0'); 96 for (char i = 'a'; i <= 'f'; i++) 97 digits16[i] = (byte) ((i - 'a') + 10); 98 for (char i = 'A'; i <= 'F'; i++) 99 digits16[i] = (byte) ((i - 'A') + 10); 100 101 footerLineKeyChars = new byte['z' + 1]; 102 footerLineKeyChars['-'] = 1; 103 for (char i = '0'; i <= '9'; i++) 104 footerLineKeyChars[i] = 1; 105 for (char i = 'A'; i <= 'Z'; i++) 106 footerLineKeyChars[i] = 1; 107 for (char i = 'a'; i <= 'z'; i++) 108 footerLineKeyChars[i] = 1; 109 } 110 111 /** 112 * Determine if b[ptr] matches src. 113 * 114 * @param b 115 * the buffer to scan. 116 * @param ptr 117 * first position within b, this should match src[0]. 118 * @param src 119 * the buffer to test for equality with b. 120 * @return ptr + src.length if b[ptr..src.length] == src; else -1. 121 */ 122 public static final int match(final byte[] b, int ptr, final byte[] src) { 123 if (ptr + src.length > b.length) 124 return -1; 125 for (int i = 0; i < src.length; i++, ptr++) 126 if (b[ptr] != src[i]) 127 return -1; 128 return ptr; 129 } 130 131 private static final byte[] base10byte = { '0', '1', '2', '3', '4', '5', 132 '6', '7', '8', '9' }; 133 134 /** 135 * Format a base 10 numeric into a temporary buffer. 136 * <p> 137 * Formatting is performed backwards. The method starts at offset 138 * <code>o-1</code> and ends at <code>o-1-digits</code>, where 139 * <code>digits</code> is the number of positions necessary to store the 140 * base 10 value. 141 * <p> 142 * The argument and return values from this method make it easy to chain 143 * writing, for example: 144 * </p> 145 * 146 * <pre> 147 * final byte[] tmp = new byte[64]; 148 * int ptr = tmp.length; 149 * tmp[--ptr] = '\n'; 150 * ptr = RawParseUtils.formatBase10(tmp, ptr, 32); 151 * tmp[--ptr] = ' '; 152 * ptr = RawParseUtils.formatBase10(tmp, ptr, 18); 153 * tmp[--ptr] = 0; 154 * final String str = new String(tmp, ptr, tmp.length - ptr); 155 * </pre> 156 * 157 * @param b 158 * buffer to write into. 159 * @param o 160 * one offset past the location where writing will begin; writing 161 * proceeds towards lower index values. 162 * @param value 163 * the value to store. 164 * @return the new offset value <code>o</code>. This is the position of 165 * the last byte written. Additional writing should start at one 166 * position earlier. 167 */ 168 public static int formatBase10(final byte[] b, int o, int value) { 169 if (value == 0) { 170 b[--o] = '0'; 171 return o; 172 } 173 final boolean isneg = value < 0; 174 if (isneg) 175 value = -value; 176 while (value != 0) { 177 b[--o] = base10byte[value % 10]; 178 value /= 10; 179 } 180 if (isneg) 181 b[--o] = '-'; 182 return o; 183 } 184 185 /** 186 * Parse a base 10 numeric from a sequence of ASCII digits into an int. 187 * <p> 188 * Digit sequences can begin with an optional run of spaces before the 189 * sequence, and may start with a '+' or a '-' to indicate sign position. 190 * Any other characters will cause the method to stop and return the current 191 * result to the caller. 192 * 193 * @param b 194 * buffer to scan. 195 * @param ptr 196 * position within buffer to start parsing digits at. 197 * @param ptrResult 198 * optional location to return the new ptr value through. If null 199 * the ptr value will be discarded. 200 * @return the value at this location; 0 if the location is not a valid 201 * numeric. 202 */ 203 public static final int parseBase10(final byte[] b, int ptr, 204 final MutableInteger ptrResult) { 205 int r = 0; 206 int sign = 0; 207 try { 208 final int sz = b.length; 209 while (ptr < sz && b[ptr] == ' ') 210 ptr++; 211 if (ptr >= sz) 212 return 0; 213 214 switch (b[ptr]) { 215 case '-': 216 sign = -1; 217 ptr++; 218 break; 219 case '+': 220 ptr++; 221 break; 222 } 223 224 while (ptr < sz) { 225 final byte v = digits10[b[ptr]]; 226 if (v < 0) 227 break; 228 r = (r * 10) + v; 229 ptr++; 230 } 231 } catch (ArrayIndexOutOfBoundsException e) { 232 // Not a valid digit. 233 } 234 if (ptrResult != null) 235 ptrResult.value = ptr; 236 return sign < 0 ? -r : r; 237 } 238 239 /** 240 * Parse a base 10 numeric from a sequence of ASCII digits into a long. 241 * <p> 242 * Digit sequences can begin with an optional run of spaces before the 243 * sequence, and may start with a '+' or a '-' to indicate sign position. 244 * Any other characters will cause the method to stop and return the current 245 * result to the caller. 246 * 247 * @param b 248 * buffer to scan. 249 * @param ptr 250 * position within buffer to start parsing digits at. 251 * @param ptrResult 252 * optional location to return the new ptr value through. If null 253 * the ptr value will be discarded. 254 * @return the value at this location; 0 if the location is not a valid 255 * numeric. 256 */ 257 public static final long parseLongBase10(final byte[] b, int ptr, 258 final MutableInteger ptrResult) { 259 long r = 0; 260 int sign = 0; 261 try { 262 final int sz = b.length; 263 while (ptr < sz && b[ptr] == ' ') 264 ptr++; 265 if (ptr >= sz) 266 return 0; 267 268 switch (b[ptr]) { 269 case '-': 270 sign = -1; 271 ptr++; 272 break; 273 case '+': 274 ptr++; 275 break; 276 } 277 278 while (ptr < sz) { 279 final byte v = digits10[b[ptr]]; 280 if (v < 0) 281 break; 282 r = (r * 10) + v; 283 ptr++; 284 } 285 } catch (ArrayIndexOutOfBoundsException e) { 286 // Not a valid digit. 287 } 288 if (ptrResult != null) 289 ptrResult.value = ptr; 290 return sign < 0 ? -r : r; 291 } 292 293 /** 294 * Parse 4 character base 16 (hex) formatted string to unsigned integer. 295 * <p> 296 * The number is read in network byte order, that is, most significant 297 * nybble first. 298 * 299 * @param bs 300 * buffer to parse digits from; positions {@code [p, p+4)} will 301 * be parsed. 302 * @param p 303 * first position within the buffer to parse. 304 * @return the integer value. 305 * @throws ArrayIndexOutOfBoundsException 306 * if the string is not hex formatted. 307 */ 308 public static final int parseHexInt16(final byte[] bs, final int p) { 309 int r = digits16[bs[p]] << 4; 310 311 r |= digits16[bs[p + 1]]; 312 r <<= 4; 313 314 r |= digits16[bs[p + 2]]; 315 r <<= 4; 316 317 r |= digits16[bs[p + 3]]; 318 if (r < 0) 319 throw new ArrayIndexOutOfBoundsException(); 320 return r; 321 } 322 323 /** 324 * Parse 8 character base 16 (hex) formatted string to unsigned integer. 325 * <p> 326 * The number is read in network byte order, that is, most significant 327 * nybble first. 328 * 329 * @param bs 330 * buffer to parse digits from; positions {@code [p, p+8)} will 331 * be parsed. 332 * @param p 333 * first position within the buffer to parse. 334 * @return the integer value. 335 * @throws ArrayIndexOutOfBoundsException 336 * if the string is not hex formatted. 337 */ 338 public static final int parseHexInt32(final byte[] bs, final int p) { 339 int r = digits16[bs[p]] << 4; 340 341 r |= digits16[bs[p + 1]]; 342 r <<= 4; 343 344 r |= digits16[bs[p + 2]]; 345 r <<= 4; 346 347 r |= digits16[bs[p + 3]]; 348 r <<= 4; 349 350 r |= digits16[bs[p + 4]]; 351 r <<= 4; 352 353 r |= digits16[bs[p + 5]]; 354 r <<= 4; 355 356 r |= digits16[bs[p + 6]]; 357 358 final int last = digits16[bs[p + 7]]; 359 if (r < 0 || last < 0) 360 throw new ArrayIndexOutOfBoundsException(); 361 return (r << 4) | last; 362 } 363 364 /** 365 * Parse a single hex digit to its numeric value (0-15). 366 * 367 * @param digit 368 * hex character to parse. 369 * @return numeric value, in the range 0-15. 370 * @throws ArrayIndexOutOfBoundsException 371 * if the input digit is not a valid hex digit. 372 */ 373 public static final int parseHexInt4(final byte digit) { 374 final byte r = digits16[digit]; 375 if (r < 0) 376 throw new ArrayIndexOutOfBoundsException(); 377 return r; 378 } 379 380 /** 381 * Parse a Git style timezone string. 382 * <p> 383 * The sequence "-0315" will be parsed as the numeric value -195, as the 384 * lower two positions count minutes, not 100ths of an hour. 385 * 386 * @param b 387 * buffer to scan. 388 * @param ptr 389 * position within buffer to start parsing digits at. 390 * @return the timezone at this location, expressed in minutes. 391 */ 392 public static final int parseTimeZoneOffset(final byte[] b, int ptr) { 393 return parseTimeZoneOffset(b, ptr, null); 394 } 395 396 /** 397 * Parse a Git style timezone string. 398 * <p> 399 * The sequence "-0315" will be parsed as the numeric value -195, as the 400 * lower two positions count minutes, not 100ths of an hour. 401 * 402 * @param b 403 * buffer to scan. 404 * @param ptr 405 * position within buffer to start parsing digits at. 406 * @param ptrResult 407 * optional location to return the new ptr value through. If null 408 * the ptr value will be discarded. 409 * @return the timezone at this location, expressed in minutes. 410 * @since 4.1 411 */ 412 public static final int parseTimeZoneOffset(final byte[] b, int ptr, 413 MutableInteger ptrResult) { 414 final int v = parseBase10(b, ptr, ptrResult); 415 final int tzMins = v % 100; 416 final int tzHours = v / 100; 417 return tzHours * 60 + tzMins; 418 } 419 420 /** 421 * Locate the first position after a given character. 422 * 423 * @param b 424 * buffer to scan. 425 * @param ptr 426 * position within buffer to start looking for chrA at. 427 * @param chrA 428 * character to find. 429 * @return new position just after chrA. 430 */ 431 public static final int next(final byte[] b, int ptr, final char chrA) { 432 final int sz = b.length; 433 while (ptr < sz) { 434 if (b[ptr++] == chrA) 435 return ptr; 436 } 437 return ptr; 438 } 439 440 /** 441 * Locate the first position after the next LF. 442 * <p> 443 * This method stops on the first '\n' it finds. 444 * 445 * @param b 446 * buffer to scan. 447 * @param ptr 448 * position within buffer to start looking for LF at. 449 * @return new position just after the first LF found. 450 */ 451 public static final int nextLF(final byte[] b, int ptr) { 452 return next(b, ptr, '\n'); 453 } 454 455 /** 456 * Locate the first position after either the given character or LF. 457 * <p> 458 * This method stops on the first match it finds from either chrA or '\n'. 459 * 460 * @param b 461 * buffer to scan. 462 * @param ptr 463 * position within buffer to start looking for chrA or LF at. 464 * @param chrA 465 * character to find. 466 * @return new position just after the first chrA or LF to be found. 467 */ 468 public static final int nextLF(final byte[] b, int ptr, final char chrA) { 469 final int sz = b.length; 470 while (ptr < sz) { 471 final byte c = b[ptr++]; 472 if (c == chrA || c == '\n') 473 return ptr; 474 } 475 return ptr; 476 } 477 478 /** 479 * Locate the first position before a given character. 480 * 481 * @param b 482 * buffer to scan. 483 * @param ptr 484 * position within buffer to start looking for chrA at. 485 * @param chrA 486 * character to find. 487 * @return new position just before chrA, -1 for not found 488 */ 489 public static final int prev(final byte[] b, int ptr, final char chrA) { 490 if (ptr == b.length) 491 --ptr; 492 while (ptr >= 0) { 493 if (b[ptr--] == chrA) 494 return ptr; 495 } 496 return ptr; 497 } 498 499 /** 500 * Locate the first position before the previous LF. 501 * <p> 502 * This method stops on the first '\n' it finds. 503 * 504 * @param b 505 * buffer to scan. 506 * @param ptr 507 * position within buffer to start looking for LF at. 508 * @return new position just before the first LF found, -1 for not found 509 */ 510 public static final int prevLF(final byte[] b, int ptr) { 511 return prev(b, ptr, '\n'); 512 } 513 514 /** 515 * Locate the previous position before either the given character or LF. 516 * <p> 517 * This method stops on the first match it finds from either chrA or '\n'. 518 * 519 * @param b 520 * buffer to scan. 521 * @param ptr 522 * position within buffer to start looking for chrA or LF at. 523 * @param chrA 524 * character to find. 525 * @return new position just before the first chrA or LF to be found, -1 for 526 * not found 527 */ 528 public static final int prevLF(final byte[] b, int ptr, final char chrA) { 529 if (ptr == b.length) 530 --ptr; 531 while (ptr >= 0) { 532 final byte c = b[ptr--]; 533 if (c == chrA || c == '\n') 534 return ptr; 535 } 536 return ptr; 537 } 538 539 /** 540 * Index the region between <code>[ptr, end)</code> to find line starts. 541 * <p> 542 * The returned list is 1 indexed. Index 0 contains 543 * {@link Integer#MIN_VALUE} to pad the list out. 544 * <p> 545 * Using a 1 indexed list means that line numbers can be directly accessed 546 * from the list, so <code>list.get(1)</code> (aka get line 1) returns 547 * <code>ptr</code>. 548 * <p> 549 * The last element (index <code>map.size()-1</code>) always contains 550 * <code>end</code>. 551 * 552 * @param buf 553 * buffer to scan. 554 * @param ptr 555 * position within the buffer corresponding to the first byte of 556 * line 1. 557 * @param end 558 * 1 past the end of the content within <code>buf</code>. 559 * @return a line map indexing the start position of each line. 560 */ 561 public static final IntList lineMap(final byte[] buf, int ptr, int end) { 562 // Experimentally derived from multiple source repositories 563 // the average number of bytes/line is 36. Its a rough guess 564 // to initially size our map close to the target. 565 // 566 final IntList map = new IntList((end - ptr) / 36); 567 map.fillTo(1, Integer.MIN_VALUE); 568 for (; ptr < end; ptr = nextLF(buf, ptr)) 569 map.add(ptr); 570 map.add(end); 571 return map; 572 } 573 574 /** 575 * Locate the "author " header line data. 576 * 577 * @param b 578 * buffer to scan. 579 * @param ptr 580 * position in buffer to start the scan at. Most callers should 581 * pass 0 to ensure the scan starts from the beginning of the 582 * commit buffer and does not accidentally look at message body. 583 * @return position just after the space in "author ", so the first 584 * character of the author's name. If no author header can be 585 * located -1 is returned. 586 */ 587 public static final int author(final byte[] b, int ptr) { 588 final int sz = b.length; 589 if (ptr == 0) 590 ptr += 46; // skip the "tree ..." line. 591 while (ptr < sz && b[ptr] == 'p') 592 ptr += 48; // skip this parent. 593 return match(b, ptr, author); 594 } 595 596 /** 597 * Locate the "committer " header line data. 598 * 599 * @param b 600 * buffer to scan. 601 * @param ptr 602 * position in buffer to start the scan at. Most callers should 603 * pass 0 to ensure the scan starts from the beginning of the 604 * commit buffer and does not accidentally look at message body. 605 * @return position just after the space in "committer ", so the first 606 * character of the committer's name. If no committer header can be 607 * located -1 is returned. 608 */ 609 public static final int committer(final byte[] b, int ptr) { 610 final int sz = b.length; 611 if (ptr == 0) 612 ptr += 46; // skip the "tree ..." line. 613 while (ptr < sz && b[ptr] == 'p') 614 ptr += 48; // skip this parent. 615 if (ptr < sz && b[ptr] == 'a') 616 ptr = nextLF(b, ptr); 617 return match(b, ptr, committer); 618 } 619 620 /** 621 * Locate the "tagger " header line data. 622 * 623 * @param b 624 * buffer to scan. 625 * @param ptr 626 * position in buffer to start the scan at. Most callers should 627 * pass 0 to ensure the scan starts from the beginning of the tag 628 * buffer and does not accidentally look at message body. 629 * @return position just after the space in "tagger ", so the first 630 * character of the tagger's name. If no tagger header can be 631 * located -1 is returned. 632 */ 633 public static final int tagger(final byte[] b, int ptr) { 634 final int sz = b.length; 635 if (ptr == 0) 636 ptr += 48; // skip the "object ..." line. 637 while (ptr < sz) { 638 if (b[ptr] == '\n') 639 return -1; 640 final int m = match(b, ptr, tagger); 641 if (m >= 0) 642 return m; 643 ptr = nextLF(b, ptr); 644 } 645 return -1; 646 } 647 648 /** 649 * Locate the "encoding " header line. 650 * 651 * @param b 652 * buffer to scan. 653 * @param ptr 654 * position in buffer to start the scan at. Most callers should 655 * pass 0 to ensure the scan starts from the beginning of the 656 * buffer and does not accidentally look at the message body. 657 * @return position just after the space in "encoding ", so the first 658 * character of the encoding's name. If no encoding header can be 659 * located -1 is returned (and UTF-8 should be assumed). 660 */ 661 public static final int encoding(final byte[] b, int ptr) { 662 final int sz = b.length; 663 while (ptr < sz) { 664 if (b[ptr] == '\n') 665 return -1; 666 if (b[ptr] == 'e') 667 break; 668 ptr = nextLF(b, ptr); 669 } 670 return match(b, ptr, encoding); 671 } 672 673 /** 674 * Parse the "encoding " header into a character set reference. 675 * <p> 676 * Locates the "encoding " header (if present) by first calling 677 * {@link #encoding(byte[], int)} and then returns the proper character set 678 * to apply to this buffer to evaluate its contents as character data. 679 * <p> 680 * If no encoding header is present, {@link Constants#CHARSET} is assumed. 681 * 682 * @param b 683 * buffer to scan. 684 * @return the Java character set representation. Never null. 685 */ 686 public static Charset parseEncoding(final byte[] b) { 687 final int enc = encoding(b, 0); 688 if (enc < 0) 689 return Constants.CHARSET; 690 final int lf = nextLF(b, enc); 691 String decoded = decode(Constants.CHARSET, b, enc, lf - 1); 692 try { 693 return Charset.forName(decoded); 694 } catch (IllegalCharsetNameException badName) { 695 Charset aliased = charsetForAlias(decoded); 696 if (aliased != null) 697 return aliased; 698 throw badName; 699 } catch (UnsupportedCharsetException badName) { 700 Charset aliased = charsetForAlias(decoded); 701 if (aliased != null) 702 return aliased; 703 throw badName; 704 } 705 } 706 707 /** 708 * Parse a name string (e.g. author, committer, tagger) into a PersonIdent. 709 * <p> 710 * Leading spaces won't be trimmed from the string, i.e. will show up in the 711 * parsed name afterwards. 712 * 713 * @param in 714 * the string to parse a name from. 715 * @return the parsed identity or null in case the identity could not be 716 * parsed. 717 */ 718 public static PersonIdent parsePersonIdent(final String in) { 719 return parsePersonIdent(Constants.encode(in), 0); 720 } 721 722 /** 723 * Parse a name line (e.g. author, committer, tagger) into a PersonIdent. 724 * <p> 725 * When passing in a value for <code>nameB</code> callers should use the 726 * return value of {@link #author(byte[], int)} or 727 * {@link #committer(byte[], int)}, as these methods provide the proper 728 * position within the buffer. 729 * 730 * @param raw 731 * the buffer to parse character data from. 732 * @param nameB 733 * first position of the identity information. This should be the 734 * first position after the space which delimits the header field 735 * name (e.g. "author" or "committer") from the rest of the 736 * identity line. 737 * @return the parsed identity or null in case the identity could not be 738 * parsed. 739 */ 740 public static PersonIdent parsePersonIdent(final byte[] raw, final int nameB) { 741 final Charset cs = parseEncoding(raw); 742 final int emailB = nextLF(raw, nameB, '<'); 743 final int emailE = nextLF(raw, emailB, '>'); 744 if (emailB >= raw.length || raw[emailB] == '\n' || 745 (emailE >= raw.length - 1 && raw[emailE - 1] != '>')) 746 return null; 747 748 final int nameEnd = emailB - 2 >= nameB && raw[emailB - 2] == ' ' ? 749 emailB - 2 : emailB - 1; 750 final String name = decode(cs, raw, nameB, nameEnd); 751 final String email = decode(cs, raw, emailB, emailE - 1); 752 753 // Start searching from end of line, as after first name-email pair, 754 // another name-email pair may occur. We will ignore all kinds of 755 // "junk" following the first email. 756 // 757 // We've to use (emailE - 1) for the case that raw[email] is LF, 758 // otherwise we would run too far. "-2" is necessary to position 759 // before the LF in case of LF termination resp. the penultimate 760 // character if there is no trailing LF. 761 final int tzBegin = lastIndexOfTrim(raw, ' ', 762 nextLF(raw, emailE - 1) - 2) + 1; 763 if (tzBegin <= emailE) // No time/zone, still valid 764 return new PersonIdent(name, email, 0, 0); 765 766 final int whenBegin = Math.max(emailE, 767 lastIndexOfTrim(raw, ' ', tzBegin - 1) + 1); 768 if (whenBegin >= tzBegin - 1) // No time/zone, still valid 769 return new PersonIdent(name, email, 0, 0); 770 771 final long when = parseLongBase10(raw, whenBegin, null); 772 final int tz = parseTimeZoneOffset(raw, tzBegin); 773 return new PersonIdent(name, email, when * 1000L, tz); 774 } 775 776 /** 777 * Parse a name data (e.g. as within a reflog) into a PersonIdent. 778 * <p> 779 * When passing in a value for <code>nameB</code> callers should use the 780 * return value of {@link #author(byte[], int)} or 781 * {@link #committer(byte[], int)}, as these methods provide the proper 782 * position within the buffer. 783 * 784 * @param raw 785 * the buffer to parse character data from. 786 * @param nameB 787 * first position of the identity information. This should be the 788 * first position after the space which delimits the header field 789 * name (e.g. "author" or "committer") from the rest of the 790 * identity line. 791 * @return the parsed identity. Never null. 792 */ 793 public static PersonIdent parsePersonIdentOnly(final byte[] raw, 794 final int nameB) { 795 int stop = nextLF(raw, nameB); 796 int emailB = nextLF(raw, nameB, '<'); 797 int emailE = nextLF(raw, emailB, '>'); 798 final String name; 799 final String email; 800 if (emailE < stop) { 801 email = decode(raw, emailB, emailE - 1); 802 } else { 803 email = "invalid"; //$NON-NLS-1$ 804 } 805 if (emailB < stop) 806 name = decode(raw, nameB, emailB - 2); 807 else 808 name = decode(raw, nameB, stop); 809 810 final MutableInteger ptrout = new MutableInteger(); 811 long when; 812 int tz; 813 if (emailE < stop) { 814 when = parseLongBase10(raw, emailE + 1, ptrout); 815 tz = parseTimeZoneOffset(raw, ptrout.value); 816 } else { 817 when = 0; 818 tz = 0; 819 } 820 return new PersonIdent(name, email, when * 1000L, tz); 821 } 822 823 /** 824 * Locate the end of a footer line key string. 825 * <p> 826 * If the region at {@code raw[ptr]} matches {@code ^[A-Za-z0-9-]+:} (e.g. 827 * "Signed-off-by: A. U. Thor\n") then this method returns the position of 828 * the first ':'. 829 * <p> 830 * If the region at {@code raw[ptr]} does not match {@code ^[A-Za-z0-9-]+:} 831 * then this method returns -1. 832 * 833 * @param raw 834 * buffer to scan. 835 * @param ptr 836 * first position within raw to consider as a footer line key. 837 * @return position of the ':' which terminates the footer line key if this 838 * is otherwise a valid footer line key; otherwise -1. 839 */ 840 public static int endOfFooterLineKey(final byte[] raw, int ptr) { 841 try { 842 for (;;) { 843 final byte c = raw[ptr]; 844 if (footerLineKeyChars[c] == 0) { 845 if (c == ':') 846 return ptr; 847 return -1; 848 } 849 ptr++; 850 } 851 } catch (ArrayIndexOutOfBoundsException e) { 852 return -1; 853 } 854 } 855 856 /** 857 * Decode a buffer under UTF-8, if possible. 858 * 859 * If the byte stream cannot be decoded that way, the platform default is tried 860 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 861 * 862 * @param buffer 863 * buffer to pull raw bytes from. 864 * @return a string representation of the range <code>[start,end)</code>, 865 * after decoding the region through the specified character set. 866 */ 867 public static String decode(final byte[] buffer) { 868 return decode(buffer, 0, buffer.length); 869 } 870 871 /** 872 * Decode a buffer under UTF-8, if possible. 873 * 874 * If the byte stream cannot be decoded that way, the platform default is 875 * tried and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 876 * 877 * @param buffer 878 * buffer to pull raw bytes from. 879 * @param start 880 * start position in buffer 881 * @param end 882 * one position past the last location within the buffer to take 883 * data from. 884 * @return a string representation of the range <code>[start,end)</code>, 885 * after decoding the region through the specified character set. 886 */ 887 public static String decode(final byte[] buffer, final int start, 888 final int end) { 889 return decode(Constants.CHARSET, buffer, start, end); 890 } 891 892 /** 893 * Decode a buffer under the specified character set if possible. 894 * 895 * If the byte stream cannot be decoded that way, the platform default is tried 896 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 897 * 898 * @param cs 899 * character set to use when decoding the buffer. 900 * @param buffer 901 * buffer to pull raw bytes from. 902 * @return a string representation of the range <code>[start,end)</code>, 903 * after decoding the region through the specified character set. 904 */ 905 public static String decode(final Charset cs, final byte[] buffer) { 906 return decode(cs, buffer, 0, buffer.length); 907 } 908 909 /** 910 * Decode a region of the buffer under the specified character set if possible. 911 * 912 * If the byte stream cannot be decoded that way, the platform default is tried 913 * and if that too fails, the fail-safe ISO-8859-1 encoding is tried. 914 * 915 * @param cs 916 * character set to use when decoding the buffer. 917 * @param buffer 918 * buffer to pull raw bytes from. 919 * @param start 920 * first position within the buffer to take data from. 921 * @param end 922 * one position past the last location within the buffer to take 923 * data from. 924 * @return a string representation of the range <code>[start,end)</code>, 925 * after decoding the region through the specified character set. 926 */ 927 public static String decode(final Charset cs, final byte[] buffer, 928 final int start, final int end) { 929 try { 930 return decodeNoFallback(cs, buffer, start, end); 931 } catch (CharacterCodingException e) { 932 // Fall back to an ISO-8859-1 style encoding. At least all of 933 // the bytes will be present in the output. 934 // 935 return extractBinaryString(buffer, start, end); 936 } 937 } 938 939 /** 940 * Decode a region of the buffer under the specified character set if 941 * possible. 942 * 943 * If the byte stream cannot be decoded that way, the platform default is 944 * tried and if that too fails, an exception is thrown. 945 * 946 * @param cs 947 * character set to use when decoding the buffer. 948 * @param buffer 949 * buffer to pull raw bytes from. 950 * @param start 951 * first position within the buffer to take data from. 952 * @param end 953 * one position past the last location within the buffer to take 954 * data from. 955 * @return a string representation of the range <code>[start,end)</code>, 956 * after decoding the region through the specified character set. 957 * @throws CharacterCodingException 958 * the input is not in any of the tested character sets. 959 */ 960 public static String decodeNoFallback(final Charset cs, 961 final byte[] buffer, final int start, final int end) 962 throws CharacterCodingException { 963 final ByteBuffer b = ByteBuffer.wrap(buffer, start, end - start); 964 b.mark(); 965 966 // Try our built-in favorite. The assumption here is that 967 // decoding will fail if the data is not actually encoded 968 // using that encoder. 969 // 970 try { 971 return decode(b, Constants.CHARSET); 972 } catch (CharacterCodingException e) { 973 b.reset(); 974 } 975 976 if (!cs.equals(Constants.CHARSET)) { 977 // Try the suggested encoding, it might be right since it was 978 // provided by the caller. 979 // 980 try { 981 return decode(b, cs); 982 } catch (CharacterCodingException e) { 983 b.reset(); 984 } 985 } 986 987 // Try the default character set. A small group of people 988 // might actually use the same (or very similar) locale. 989 // 990 final Charset defcs = Charset.defaultCharset(); 991 if (!defcs.equals(cs) && !defcs.equals(Constants.CHARSET)) { 992 try { 993 return decode(b, defcs); 994 } catch (CharacterCodingException e) { 995 b.reset(); 996 } 997 } 998 999 throw new CharacterCodingException(); 1000 } 1001 1002 /** 1003 * Decode a region of the buffer under the ISO-8859-1 encoding. 1004 * 1005 * Each byte is treated as a single character in the 8859-1 character 1006 * encoding, performing a raw binary->char conversion. 1007 * 1008 * @param buffer 1009 * buffer to pull raw bytes from. 1010 * @param start 1011 * first position within the buffer to take data from. 1012 * @param end 1013 * one position past the last location within the buffer to take 1014 * data from. 1015 * @return a string representation of the range <code>[start,end)</code>. 1016 */ 1017 public static String extractBinaryString(final byte[] buffer, 1018 final int start, final int end) { 1019 final StringBuilder r = new StringBuilder(end - start); 1020 for (int i = start; i < end; i++) 1021 r.append((char) (buffer[i] & 0xff)); 1022 return r.toString(); 1023 } 1024 1025 private static String decode(final ByteBuffer b, final Charset charset) 1026 throws CharacterCodingException { 1027 final CharsetDecoder d = charset.newDecoder(); 1028 d.onMalformedInput(CodingErrorAction.REPORT); 1029 d.onUnmappableCharacter(CodingErrorAction.REPORT); 1030 return d.decode(b).toString(); 1031 } 1032 1033 /** 1034 * Locate the position of the commit message body. 1035 * 1036 * @param b 1037 * buffer to scan. 1038 * @param ptr 1039 * position in buffer to start the scan at. Most callers should 1040 * pass 0 to ensure the scan starts from the beginning of the 1041 * commit buffer. 1042 * @return position of the user's message buffer. 1043 */ 1044 public static final int commitMessage(final byte[] b, int ptr) { 1045 final int sz = b.length; 1046 if (ptr == 0) 1047 ptr += 46; // skip the "tree ..." line. 1048 while (ptr < sz && b[ptr] == 'p') 1049 ptr += 48; // skip this parent. 1050 1051 // Skip any remaining header lines, ignoring what their actual 1052 // header line type is. This is identical to the logic for a tag. 1053 // 1054 return tagMessage(b, ptr); 1055 } 1056 1057 /** 1058 * Locate the position of the tag message body. 1059 * 1060 * @param b 1061 * buffer to scan. 1062 * @param ptr 1063 * position in buffer to start the scan at. Most callers should 1064 * pass 0 to ensure the scan starts from the beginning of the tag 1065 * buffer. 1066 * @return position of the user's message buffer. 1067 */ 1068 public static final int tagMessage(final byte[] b, int ptr) { 1069 final int sz = b.length; 1070 if (ptr == 0) 1071 ptr += 48; // skip the "object ..." line. 1072 while (ptr < sz && b[ptr] != '\n') 1073 ptr = nextLF(b, ptr); 1074 if (ptr < sz && b[ptr] == '\n') 1075 return ptr + 1; 1076 return -1; 1077 } 1078 1079 /** 1080 * Locate the end of a paragraph. 1081 * <p> 1082 * A paragraph is ended by two consecutive LF bytes or CRLF pairs 1083 * 1084 * @param b 1085 * buffer to scan. 1086 * @param start 1087 * position in buffer to start the scan at. Most callers will 1088 * want to pass the first position of the commit message (as 1089 * found by {@link #commitMessage(byte[], int)}. 1090 * @return position of the LF at the end of the paragraph; 1091 * <code>b.length</code> if no paragraph end could be located. 1092 */ 1093 public static final int endOfParagraph(final byte[] b, final int start) { 1094 int ptr = start; 1095 final int sz = b.length; 1096 while (ptr < sz && (b[ptr] != '\n' && b[ptr] != '\r')) 1097 ptr = nextLF(b, ptr); 1098 if (ptr > start && b[ptr - 1] == '\n') 1099 ptr--; 1100 if (ptr > start && b[ptr - 1] == '\r') 1101 ptr--; 1102 return ptr; 1103 } 1104 1105 /** 1106 * @param raw 1107 * buffer to scan. 1108 * @param ch 1109 * character to find. 1110 * @param pos 1111 * starting position. 1112 * @return last index of ch in raw, trimming spaces. 1113 * @since 4.1 1114 */ 1115 public static int lastIndexOfTrim(byte[] raw, char ch, int pos) { 1116 while (pos >= 0 && raw[pos] == ' ') 1117 pos--; 1118 1119 while (pos >= 0 && raw[pos] != ch) 1120 pos--; 1121 1122 return pos; 1123 } 1124 1125 private static Charset charsetForAlias(String name) { 1126 return encodingAliases.get(StringUtils.toLowerCase(name)); 1127 } 1128 1129 private RawParseUtils() { 1130 // Don't create instances of a static only utility. 1131 } 1132 }