/*
 * Copyright (C) 2009, Google Inc.
 * Copyright (C) 2008-2009, Johannes E. Schindelin <johannes.schindelin@gmx.de>
 * and other copyright owners as documented in the project's IP log.
 *
 * This program and the accompanying materials are made available
 * under the terms of the Eclipse Distribution License v1.0 which
 * accompanies this distribution, is reproduced below, and is
 * available at http://www.eclipse.org/org/documents/edl-v10.php
 *
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *
 * - Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * - Redistributions in binary form must reproduce the above
 *   copyright notice, this list of conditions and the following
 *   disclaimer in the documentation and/or other materials provided
 *   with the distribution.
 *
 * - Neither the name of the Eclipse Foundation, Inc. nor the
 *   names of its contributors may be used to endorse or promote
 *   products derived from this software without specific prior
 *   written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
 * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

package org.eclipse.jgit.diff;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;

import org.eclipse.jgit.util.IO;
import org.eclipse.jgit.util.IntList;
import org.eclipse.jgit.util.RawParseUtils;

/**
 * A Sequence supporting UNIX formatted text in byte[] format.
 * <p>
 * Elements of the sequence are the lines of the file, as delimited by the UNIX
 * newline character ('\n'). The file content is treated as 8 bit binary text,
 * with no assumptions or requirements on character encoding.
 * <p>
 * Note that the first line of the file is element 0, as defined by the Sequence
 * interface API. Traditionally in a text editor or a patch file the first line
 * is line number 1. Callers may need to subtract 1 prior to invoking methods if
 * they are converting from "line number" to "element index".
 */
public class RawText extends Sequence {
    /** A RawText of length 0 */
    public static final RawText EMPTY_TEXT = new RawText(new byte[0]);

    /** Number of bytes to check for heuristics in {@link #isBinary(byte[])} */
    private static final int FIRST_FEW_BYTES = 8000;

    /** The file content for this sequence. */
    protected final byte[] content;

    /** Map of line number to starting position within {@link #content}. */
    protected final IntList lines;

    /**
     * Create a new sequence from an existing content byte array.
     * <p>
     * The entire array (indexes 0 through length-1) is used as the content.
     *
     * @param input
     *            the content array. The array is never modified, so passing
     *            through cached arrays is safe.
     */
    public RawText(final byte[] input) {
        content = input;
        lines = RawParseUtils.lineMap(content, 0, content.length);
    }

    /**
     * Create a new sequence from a file.
     * <p>
     * The entire file contents are used.
     *
     * @param file
     *            the text file.
     * @throws IOException
     *             if Exceptions occur while reading the file
     */
    public RawText(File file) throws IOException {
        this(IO.readFully(file));
    }

    /** @return total number of items in the sequence. */
    @Override
    public int size() {
        // The line map is always 2 entries larger than the number of lines in
        // the file. Index 0 is padded out/unused. The last index is the total
        // length of the buffer, and acts as a sentinel.
        //
        return lines.size() - 2;
    }

    /**
     * Write a specific line to the output stream, without its trailing LF.
     * <p>
     * The specified line is copied as-is, with no character encoding
     * translation performed.
     * <p>
     * If the specified line ends with an LF ('\n'), the LF is <b>not</b>
     * copied. It is up to the caller to write the LF, if desired, between
     * output lines.
     *
     * @param out
     *            stream to copy the line data onto.
     * @param i
     *            index of the line to extract. Note this is 0-based, so line
     *            number 1 is actually index 0.
     * @throws IOException
     *             the stream write operation failed.
     */
    public void writeLine(final OutputStream out, final int i)
            throws IOException {
        int start = getStart(i);
        int end = getEnd(i);
        // Exclude the line terminator from the bytes handed to the stream.
        if (content[end - 1] == '\n')
            end--;
        out.write(content, start, end - start);
    }

    /**
     * Determine if the file is missing a trailing LF ('\n').
     *
     * @return true if the content is empty or its last byte is not an LF;
     *         false if the content ends with an LF.
     */
    public boolean isMissingNewlineAtEnd() {
        // The last entry of the line map is the sentinel holding the total
        // length of the buffer (see size()).
        final int end = lines.get(lines.size() - 1);
        if (end == 0)
            return true;
        return content[end - 1] != '\n';
    }

    /**
     * Get the text for a single line.
     *
     * @param i
     *            index of the line to extract. Note this is 0-based, so line
     *            number 1 is actually index 0.
     * @return the text for the line, without a trailing LF.
     */
    public String getString(int i) {
        return getString(i, i + 1, true);
    }

    /**
     * Get the text for a region of lines.
     *
     * @param begin
     *            index of the first line to extract. Note this is 0-based, so
     *            line number 1 is actually index 0.
     * @param end
     *            index of one past the last line to extract.
     * @param dropLF
     *            if true the trailing LF ('\n') of the last returned line is
     *            dropped, if present.
     * @return the text for lines {@code [begin, end)}; the empty string if
     *         {@code begin == end}.
     */
    public String getString(int begin, int end, boolean dropLF) {
        if (begin == end)
            return ""; //$NON-NLS-1$

        int s = getStart(begin);
        int e = getEnd(end - 1);
        if (dropLF && content[e - 1] == '\n')
            e--;
        return decode(s, e);
    }

    /**
     * Decode a region of the text into a String.
     *
     * The default implementation of this method tries to guess the character
     * set by considering UTF-8, the platform default, and falling back on
     * ISO-8859-1 if neither of those can correctly decode the region given.
     *
     * @param start
     *            first byte of the content to decode.
     * @param end
     *            one past the last byte of the content to decode.
     * @return the region {@code [start, end)} decoded as a String.
     */
    protected String decode(int start, int end) {
        return RawParseUtils.decode(content, start, end);
    }

    // Line map entry i + 1 is the offset of the first byte of line i
    // (entry 0 is padding, see size()).
    private int getStart(final int i) {
        return lines.get(i + 1);
    }

    // The end of line i is the start of line i + 1, or the sentinel entry
    // for the last line.
    private int getEnd(final int i) {
        return lines.get(i + 2);
    }

    /**
     * Determine heuristically whether a byte array represents binary (as
     * opposed to text) content.
     *
     * @param raw
     *            the raw file content.
     * @return true if raw is likely to be a binary file, false otherwise
     */
    public static boolean isBinary(byte[] raw) {
        return isBinary(raw, raw.length);
    }

    /**
     * Determine heuristically whether the bytes contained in a stream
     * represents binary (as opposed to text) content.
     *
     * Note: Do not further use this stream after having called this method! The
     * stream may not be fully read and will be left at an unknown position
     * after consuming an unknown number of bytes. The caller is responsible for
     * closing the stream.
     *
     * @param raw
     *            input stream containing the raw file content.
     * @return true if raw is likely to be a binary file, false otherwise
     * @throws IOException
     *             if input stream could not be read
     */
    public static boolean isBinary(InputStream raw) throws IOException {
        final byte[] buffer = new byte[FIRST_FEW_BYTES];
        int cnt = 0;
        // A single read may return fewer bytes than requested, so keep
        // reading until the buffer is full or the stream is exhausted.
        while (cnt < buffer.length) {
            final int n = raw.read(buffer, cnt, buffer.length - cnt);
            if (n == -1)
                break;
            cnt += n;
        }
        return isBinary(buffer, cnt);
    }

    /**
     * Determine heuristically whether a byte array represents binary (as
     * opposed to text) content.
     * <p>
     * At most the first {@code FIRST_FEW_BYTES} bytes are examined; the
     * content is considered binary if any of them is a NUL byte.
     *
     * @param raw
     *            the raw file content.
     * @param length
     *            number of bytes in {@code raw} to evaluate. This should be
     *            {@code raw.length} unless {@code raw} was over-allocated by
     *            the caller.
     * @return true if raw is likely to be a binary file, false otherwise
     */
    public static boolean isBinary(byte[] raw, int length) {
        // Same heuristic as C Git
        if (length > FIRST_FEW_BYTES)
            length = FIRST_FEW_BYTES;
        for (int ptr = 0; ptr < length; ptr++)
            if (raw[ptr] == '\0')
                return true;

        return false;
    }

    /**
     * Get the line delimiter for the first line.
     *
     * @since 2.0
     * @return {@code "\r\n"} if the first line ends with CR LF, {@code "\n"}
     *         if it ends with a bare LF, or <code>null</code> if there are no
     *         lines or the first line is not terminated by an LF.
     */
    public String getLineDelimiter() {
        if (size() == 0)
            return null;
        int e = getEnd(0);
        if (content[e - 1] != '\n')
            return null;
        // e > 1 guarantees that a byte exists in front of the LF.
        if (e > 1 && content[e - 2] == '\r')
            return "\r\n"; //$NON-NLS-1$
        else
            return "\n"; //$NON-NLS-1$
    }
}