1 /* 2 * Copyright (C) 2009, Google Inc. 3 * Copyright (C) 2008-2009, Johannes E. Schindelin <johannes.schindelin@gmx.de> 4 * and other copyright owners as documented in the project's IP log. 5 * 6 * This program and the accompanying materials are made available 7 * under the terms of the Eclipse Distribution License v1.0 which 8 * accompanies this distribution, is reproduced below, and is 9 * available at http://www.eclipse.org/org/documents/edl-v10.php 10 * 11 * All rights reserved. 12 * 13 * Redistribution and use in source and binary forms, with or 14 * without modification, are permitted provided that the following 15 * conditions are met: 16 * 17 * - Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 20 * - Redistributions in binary form must reproduce the above 21 * copyright notice, this list of conditions and the following 22 * disclaimer in the documentation and/or other materials provided 23 * with the distribution. 24 * 25 * - Neither the name of the Eclipse Foundation, Inc. nor the 26 * names of its contributors may be used to endorse or promote 27 * products derived from this software without specific prior 28 * written permission. 29 * 30 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND 31 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, 32 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 33 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 34 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 35 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 36 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 37 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 38 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 39 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, 40 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 41 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 42 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 43 */ 44 45 package org.eclipse.jgit.diff; 46 47 import java.io.EOFException; 48 import java.io.File; 49 import java.io.IOException; 50 import java.io.InputStream; 51 import java.io.OutputStream; 52 53 import org.eclipse.jgit.errors.BinaryBlobException; 54 import org.eclipse.jgit.errors.LargeObjectException; 55 import org.eclipse.jgit.lib.ObjectLoader; 56 import org.eclipse.jgit.util.IO; 57 import org.eclipse.jgit.util.IntList; 58 import org.eclipse.jgit.util.RawParseUtils; 59 60 /** 61 * A Sequence supporting UNIX formatted text in byte[] format. 62 * <p> 63 * Elements of the sequence are the lines of the file, as delimited by the UNIX 64 * newline character ('\n'). The file content is treated as 8 bit binary text, 65 * with no assumptions or requirements on character encoding. 66 * <p> 67 * Note that the first line of the file is element 0, as defined by the Sequence 68 * interface API. Traditionally in a text editor a patch file the first line is 69 * line number 1. Callers may need to subtract 1 prior to invoking methods if 70 * they are converting from "line number" to "element index". 71 */ 72 public class RawText extends Sequence { 73 /** A RawText of length 0 */ 74 public static final RawText EMPTY_TEXT = new RawText(new byte[0]); 75 76 /** Number of bytes to check for heuristics in {@link #isBinary(byte[])} */ 77 static final int FIRST_FEW_BYTES = 8000; 78 79 /** The file content for this sequence. */ 80 protected final byte[] content; 81 82 /** Map of line number to starting position within {@link #content}. */ 83 protected final IntList lines; 84 85 /** 86 * Create a new sequence from an existing content byte array. 87 * <p> 88 * The entire array (indexes 0 through length-1) is used as the content. 89 * 90 * @param input 91 * the content array. The array is never modified, so passing 92 * through cached arrays is safe. 93 */ 94 public RawText(final byte[] input) { 95 content = input; 96 lines = RawParseUtils.lineMap(content, 0, content.length); 97 } 98 99 /** 100 * Create a new sequence from a file. 101 * <p> 102 * The entire file contents are used. 103 * 104 * @param file 105 * the text file. 106 * @throws java.io.IOException 107 * if Exceptions occur while reading the file 108 */ 109 public RawText(File file) throws IOException { 110 this(IO.readFully(file)); 111 } 112 113 /** 114 * @return the raw, unprocessed content read. 115 * @since 4.11 116 */ 117 public byte[] getRawContent() { 118 return content; 119 } 120 121 /** @return total number of items in the sequence. */ 122 /** {@inheritDoc} */ 123 @Override 124 public int size() { 125 // The line map is always 2 entries larger than the number of lines in 126 // the file. Index 0 is padded out/unused. The last index is the total 127 // length of the buffer, and acts as a sentinel. 128 // 129 return lines.size() - 2; 130 } 131 132 /** 133 * Write a specific line to the output stream, without its trailing LF. 134 * <p> 135 * The specified line is copied as-is, with no character encoding 136 * translation performed. 137 * <p> 138 * If the specified line ends with an LF ('\n'), the LF is <b>not</b> 139 * copied. It is up to the caller to write the LF, if desired, between 140 * output lines. 141 * 142 * @param out 143 * stream to copy the line data onto. 144 * @param i 145 * index of the line to extract. Note this is 0-based, so line 146 * number 1 is actually index 0. 147 * @throws java.io.IOException 148 * the stream write operation failed. 149 */ 150 public void writeLine(final OutputStream out, final int i) 151 throws IOException { 152 int start = getStart(i); 153 int end = getEnd(i); 154 if (content[end - 1] == '\n') 155 end--; 156 out.write(content, start, end - start); 157 } 158 159 /** 160 * Determine if the file ends with a LF ('\n'). 161 * 162 * @return true if the last line has an LF; false otherwise. 163 */ 164 public boolean isMissingNewlineAtEnd() { 165 final int end = lines.get(lines.size() - 1); 166 if (end == 0) 167 return true; 168 return content[end - 1] != '\n'; 169 } 170 171 /** 172 * Get the text for a single line. 173 * 174 * @param i 175 * index of the line to extract. Note this is 0-based, so line 176 * number 1 is actually index 0. 177 * @return the text for the line, without a trailing LF. 178 */ 179 public String getString(int i) { 180 return getString(i, i + 1, true); 181 } 182 183 /** 184 * Get the text for a region of lines. 185 * 186 * @param begin 187 * index of the first line to extract. Note this is 0-based, so 188 * line number 1 is actually index 0. 189 * @param end 190 * index of one past the last line to extract. 191 * @param dropLF 192 * if true the trailing LF ('\n') of the last returned line is 193 * dropped, if present. 194 * @return the text for lines {@code [begin, end)}. 195 */ 196 public String getString(int begin, int end, boolean dropLF) { 197 if (begin == end) 198 return ""; //$NON-NLS-1$ 199 200 int s = getStart(begin); 201 int e = getEnd(end - 1); 202 if (dropLF && content[e - 1] == '\n') 203 e--; 204 return decode(s, e); 205 } 206 207 /** 208 * Decode a region of the text into a String. 209 * 210 * The default implementation of this method tries to guess the character 211 * set by considering UTF-8, the platform default, and falling back on 212 * ISO-8859-1 if neither of those can correctly decode the region given. 213 * 214 * @param start 215 * first byte of the content to decode. 216 * @param end 217 * one past the last byte of the content to decode. 218 * @return the region {@code [start, end)} decoded as a String. 219 */ 220 protected String decode(int start, int end) { 221 return RawParseUtils.decode(content, start, end); 222 } 223 224 private int getStart(final int i) { 225 return lines.get(i + 1); 226 } 227 228 private int getEnd(final int i) { 229 return lines.get(i + 2); 230 } 231 232 /** 233 * Determine heuristically whether a byte array represents binary (as 234 * opposed to text) content. 235 * 236 * @param raw 237 * the raw file content. 238 * @return true if raw is likely to be a binary file, false otherwise 239 */ 240 public static boolean isBinary(byte[] raw) { 241 return isBinary(raw, raw.length); 242 } 243 244 /** 245 * Determine heuristically whether the bytes contained in a stream 246 * represents binary (as opposed to text) content. 247 * 248 * Note: Do not further use this stream after having called this method! The 249 * stream may not be fully read and will be left at an unknown position 250 * after consuming an unknown number of bytes. The caller is responsible for 251 * closing the stream. 252 * 253 * @param raw 254 * input stream containing the raw file content. 255 * @return true if raw is likely to be a binary file, false otherwise 256 * @throws java.io.IOException 257 * if input stream could not be read 258 */ 259 public static boolean isBinary(InputStream raw) throws IOException { 260 final byte[] buffer = new byte[FIRST_FEW_BYTES]; 261 int cnt = 0; 262 while (cnt < buffer.length) { 263 final int n = raw.read(buffer, cnt, buffer.length - cnt); 264 if (n == -1) 265 break; 266 cnt += n; 267 } 268 return isBinary(buffer, cnt); 269 } 270 271 /** 272 * Determine heuristically whether a byte array represents binary (as 273 * opposed to text) content. 274 * 275 * @param raw 276 * the raw file content. 277 * @param length 278 * number of bytes in {@code raw} to evaluate. This should be 279 * {@code raw.length} unless {@code raw} was over-allocated by 280 * the caller. 281 * @return true if raw is likely to be a binary file, false otherwise 282 */ 283 public static boolean isBinary(byte[] raw, int length) { 284 // Same heuristic as C Git 285 if (length > FIRST_FEW_BYTES) 286 length = FIRST_FEW_BYTES; 287 for (int ptr = 0; ptr < length; ptr++) 288 if (raw[ptr] == '\0') 289 return true; 290 291 return false; 292 } 293 294 /** 295 * Get the line delimiter for the first line. 296 * 297 * @since 2.0 298 * @return the line delimiter or <code>null</code> 299 */ 300 public String getLineDelimiter() { 301 if (size() == 0) 302 return null; 303 int e = getEnd(0); 304 if (content[e - 1] != '\n') 305 return null; 306 if (content.length > 1 && e > 1 && content[e - 2] == '\r') 307 return "\r\n"; //$NON-NLS-1$ 308 else 309 return "\n"; //$NON-NLS-1$ 310 } 311 312 /** 313 * Read a blob object into RawText, or throw BinaryBlobException if the blob 314 * is binary. 315 * 316 * @param ldr 317 * the ObjectLoader for the blob 318 * @param threshold 319 * if the blob is larger than this size, it is always assumed to 320 * be binary. 321 * @since 4.10 322 * @return the RawText representing the blob. 323 * @throws org.eclipse.jgit.errors.BinaryBlobException 324 * if the blob contains binary data. 325 * @throws java.io.IOException 326 * if the input could not be read. 327 */ 328 public static RawText load(ObjectLoader ldr, int threshold) throws IOException, BinaryBlobException { 329 long sz = ldr.getSize(); 330 331 if (sz > threshold) { 332 throw new BinaryBlobException(); 333 } 334 335 if (sz <= FIRST_FEW_BYTES) { 336 byte[] data = ldr.getCachedBytes(FIRST_FEW_BYTES); 337 if (isBinary(data)) { 338 throw new BinaryBlobException(); 339 } 340 return new RawText(data); 341 } 342 343 byte[] head = new byte[FIRST_FEW_BYTES]; 344 try (InputStream stream = ldr.openStream()) { 345 int off = 0; 346 int left = head.length; 347 while (left > 0) { 348 int n = stream.read(head, off, left); 349 if (n < 0) { 350 throw new EOFException(); 351 } 352 left -= n; 353 354 while (n > 0) { 355 if (head[off] == '\0') { 356 throw new BinaryBlobException(); 357 } 358 off++; 359 n--; 360 } 361 } 362 363 byte data[]; 364 try { 365 data = new byte[(int)sz]; 366 } catch (OutOfMemoryError e) { 367 throw new LargeObjectException.OutOfMemory(e); 368 } 369 370 System.arraycopy(head, 0, data, 0, head.length); 371 IO.readFully(stream, data, off, (int) (sz-off)); 372 return new RawText(data); 373 } 374 } 375 }