|
| 1 | +/* |
| 2 | + * LibXDiff by Davide Libenzi ( File Differential Library ) |
| 3 | + * Copyright (C) 2003 Davide Libenzi |
| 4 | + * |
| 5 | + * This library is free software; you can redistribute it and/or |
| 6 | + * modify it under the terms of the GNU Lesser General Public |
| 7 | + * License as published by the Free Software Foundation; either |
| 8 | + * version 2.1 of the License, or (at your option) any later version. |
| 9 | + * |
| 10 | + * This library is distributed in the hope that it will be useful, |
| 11 | + * but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 12 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 13 | + * Lesser General Public License for more details. |
| 14 | + * |
| 15 | + * You should have received a copy of the GNU Lesser General Public |
| 16 | + * License along with this library; if not, see |
| 17 | + * <http://www.gnu.org/licenses/>. |
| 18 | + * |
| 19 | + * Davide Libenzi <[email protected]> |
| 20 | + * |
| 21 | + * Ported to Java by Ned Twigg <[email protected]> on August 30 2019 |
| 22 | + */ |
| 23 | +package com.diffplug.libxdiff; |
| 24 | + |
| 25 | +import java.util.List; |
| 26 | + |
/**
 * Java port of the indent heuristic scoring from LibXDiff.
 *
 * <p>A "split" is a hypothetical boundary placed immediately above a given line.
 * Each split is measured (indentation of the surrounding lines, number of
 * neighbouring blank lines) and the measurements are folded into a
 * {@link split_score}. Scores are only meaningful relative to each other; use
 * {@link #score_cmp(split_score, split_score)} to compare two of them.
 *
 * <p>This class is a static utility holder and cannot be instantiated.
 */
public class LibXDiff {
    private LibXDiff() {}

    /**
     * If more than this number of consecutive blank rows are found, just return this
     * value. This avoids requiring O(N^2) work for pathological cases, and also
     * ensures that the output of score_add_split fits in an int.
     */
    private static final int MAX_BLANKS = 20;

    /**
     * If a line is indented more than this, get_indent() just returns this value.
     * This avoids having to do absurd amounts of work for data that are not
     * human-readable text, and also ensures that the output of get_indent fits within
     * an int.
     */
    private static final int MAX_INDENT = 200;

    /*
     * The empirically-determined weight factors used by score_add_split() below.
     * Larger values means that the position is a less favorable place to split.
     *
     * Note that scores are only ever compared against each other, so multiplying
     * all of these weight/penalty values by the same factor wouldn't change the
     * heuristic's behavior. Still, we need to set that arbitrary scale *somehow*.
     * In practice, these numbers are chosen to be large enough that they can be
     * adjusted relative to each other with sufficient precision despite using
     * integer math.
     */

    /** Penalty if there are no non-blank lines before the split */
    private static final int START_OF_FILE_PENALTY = 1;

    /** Penalty if there are no non-blank lines after the split */
    private static final int END_OF_FILE_PENALTY = 21;

    /** Multiplier for the number of blank lines around the split */
    private static final int TOTAL_BLANK_WEIGHT = -30;

    /** Multiplier for the number of blank lines after the split */
    private static final int POST_BLANK_WEIGHT = 6;

    /**
     * Penalties applied if the line is indented more than its predecessor
     */
    private static final int RELATIVE_INDENT_PENALTY = -4;
    private static final int RELATIVE_INDENT_WITH_BLANK_PENALTY = 10;

    /**
     * Penalties applied if the line is indented less than both its predecessor and
     * its successor
     */
    private static final int RELATIVE_OUTDENT_PENALTY = 24;
    private static final int RELATIVE_OUTDENT_WITH_BLANK_PENALTY = 17;

    /**
     * Penalties applied if the line is indented less than its predecessor but not
     * less than its successor
     */
    private static final int RELATIVE_DEDENT_PENALTY = 23;
    private static final int RELATIVE_DEDENT_WITH_BLANK_PENALTY = 17;

    /**
     * We only consider whether the sum of the effective indents for splits are
     * less than (-1), equal to (0), or greater than (+1) each other. The resulting
     * value is multiplied by the following weight and combined with the penalty to
     * determine the better of two scores.
     */
    private static final int INDENT_WEIGHT = 60;

    /**
     * How far do we slide a hunk at most?
     */
    public static final int INDENT_HEURISTIC_MAX_SLIDING = 100;

    /**
     * Computes the combined score of two hypothetical splits: one above line
     * {@code end} and one above line {@code start}. The two splits' contributions
     * are simply added together.
     *
     * @param lines the lines of the file; each element is inspected character by
     *              character for leading whitespace
     * @param start index of the line below the first split
     * @param end   index of the line below the second split; an index at or past
     *              {@code lines.size()} is treated as the end of the file
     * @return a freshly allocated score holding the summed effective indent and
     *         penalty of both splits
     * @throws IndexOutOfBoundsException if {@code start} or {@code end} is negative
     *         (the value is passed straight to {@link List#get(int)})
     */
    public static split_score score(List<CharSequence> lines, int start, int end) {
        split_score score = new split_score();
        score_add_split(measure_split(lines, end), score);
        score_add_split(measure_split(lines, start), score);
        return score;
    }

    /**
     * Measures a hypothetical split placed immediately above line {@code split}.
     *
     * @param lines the lines of the file
     * @param split index of the line directly below the split; may be equal to or
     *              larger than {@code lines.size()}, which marks the end of the file
     * @return the measurements for that split
     */
    private static split_measurement measure_split(List<CharSequence> lines, int split) {
        split_measurement m = new split_measurement();
        final int count = lines.size();

        if (split >= count) {
            m.end_of_file = true;
            m.indent = -1;
        } else {
            m.end_of_file = false;
            m.indent = get_indent(lines.get(split));
        }

        /* Walk upwards until a non-blank line is found or MAX_BLANKS is hit. */
        m.pre_blank = 0;
        m.pre_indent = -1;
        for (int i = split - 1; i >= 0; i--) {
            m.pre_indent = get_indent(lines.get(i));
            if (m.pre_indent != -1) {
                break;
            }
            m.pre_blank += 1;
            if (m.pre_blank == MAX_BLANKS) {
                /* Give up searching and pretend an unindented line was found. */
                m.pre_indent = 0;
                break;
            }
        }

        /* Same walk downwards, starting after the line that follows the split. */
        m.post_blank = 0;
        m.post_indent = -1;
        for (int i = split + 1; i < count; ++i) {
            m.post_indent = get_indent(lines.get(i));
            if (m.post_indent != -1) {
                break;
            }
            m.post_blank += 1;
            if (m.post_blank == MAX_BLANKS) {
                m.post_indent = 0;
                break;
            }
        }

        return m;
    }

    /**
     * Return the amount of indentation of the specified line, treating TAB as
     * advancing to the next multiple of 8 columns. Return -1 if line is empty or
     * contains only whitespace. Clamp the output value at MAX_INDENT.
     *
     * <p>Whitespace other than space and TAB (line feed, vertical tab, form feed,
     * carriage return) is skipped without contributing to the indent, so a line
     * consisting only of e.g. {@code "\r"} counts as blank.
     *
     * @param line the line to inspect
     * @return the indent in columns (0..MAX_INDENT), or -1 for a blank line
     */
    private static int get_indent(CharSequence line) {
        int ret = 0;

        for (int i = 0; i < line.length(); ++i) {
            char c = line.charAt(i);
            if (!is_space(c)) {
                /* First non-whitespace character: the indent is complete. */
                return ret;
            } else if (c == ' ') {
                ret += 1;
            } else if (c == '\t') {
                ret += 8 - ret % 8;
            }
            /* ignore other whitespace characters */

            if (ret >= MAX_INDENT) {
                return MAX_INDENT;
            }
        }

        /* The line contains only whitespace. */
        return -1;
    }

    /**
     * Tells whether {@code c} is one of the six ASCII whitespace characters:
     * space, TAB, line feed, vertical tab, form feed or carriage return.
     */
    private static boolean is_space(char c) {
        /* '\t' (9) through '\r' (13) are contiguous: \t \n \v \f \r */
        return c == ' ' || (c >= '\t' && c <= '\r');
    }

    /** Characteristics measured about a hypothetical split position. */
    private static class split_measurement {
        /**
         * Is the split at the end of the file (aside from any blank lines)?
         */
        boolean end_of_file;

        /**
         * How much is the line immediately following the split indented (or -1 if
         * the line is blank):
         */
        int indent;

        /**
         * How many consecutive lines above the split are blank?
         */
        int pre_blank;

        /**
         * How much is the nearest non-blank line above the split indented (or -1
         * if there is no such line)?
         */
        int pre_indent;

        /**
         * How many lines after the line following the split are blank?
         */
        int post_blank;

        /**
         * How much is the nearest non-blank line after the line following the
         * split indented (or -1 if there is no such line)?
         */
        int post_indent;
    }

    /**
     * Accumulated badness of one or more splits. Both fields start at zero and are
     * only ever added to by score_add_split().
     */
    public static class split_score {
        /** The effective indent of this split (smaller is preferred). */
        int effective_indent;

        /** Penalty for this split (smaller is preferred). */
        int penalty;
    }

    /**
     * Compute a badness score for the hypothetical split whose measurements are
     * stored in m, and add it to s. The weight factors were determined empirically
     * using the tools and corpus described in
     *
     * https://github.com/mhagger/diff-slider-tools
     *
     * Also see that project if you want to improve the weights based on, for example,
     * a larger or more diverse corpus.
     *
     * @param m measurements of the split to score
     * @param s accumulator that receives the split's effective indent and penalty
     */
    private static void score_add_split(split_measurement m, split_score s) {
        if (m.pre_indent == -1 && m.pre_blank == 0) {
            s.penalty += START_OF_FILE_PENALTY;
        }

        if (m.end_of_file) {
            s.penalty += END_OF_FILE_PENALTY;
        }

        /*
         * Set post_blank to the number of blank lines following the split,
         * including the line immediately after the split:
         */
        int post_blank = (m.indent == -1) ? 1 + m.post_blank : 0;
        int total_blank = m.pre_blank + post_blank;

        /* Penalties based on nearby blank lines: */
        s.penalty += TOTAL_BLANK_WEIGHT * total_blank;
        s.penalty += POST_BLANK_WEIGHT * post_blank;

        /* A blank line after the split borrows the indent of the next non-blank line. */
        int indent = (m.indent != -1) ? m.indent : m.post_indent;
        boolean any_blanks = (total_blank != 0);

        /* Note that the effective indent is -1 at the end of the file: */
        s.effective_indent += indent;

        if (indent == -1 || m.pre_indent == -1 || indent == m.pre_indent) {
            /*
             * Nothing to compare against, or the line has the same indentation
             * level as its predecessor. No additional adjustments needed.
             */
            return;
        }

        if (indent > m.pre_indent) {
            /*
             * The line is indented more than its predecessor.
             */
            s.penalty += any_blanks ? RELATIVE_INDENT_WITH_BLANK_PENALTY : RELATIVE_INDENT_PENALTY;
        } else if (m.post_indent != -1 && m.post_indent > indent) {
            /*
             * The line is indented less than its predecessor and the following
             * line is indented more. So it is likely that this line is the
             * start of a block (e.g., an "else" block, or maybe the previous
             * block didn't have a block terminator).
             */
            s.penalty += any_blanks ? RELATIVE_OUTDENT_WITH_BLANK_PENALTY : RELATIVE_OUTDENT_PENALTY;
        } else {
            /*
             * The line is indented less than its predecessor and not less than
             * its successor. That was probably the end of a block.
             */
            s.penalty += any_blanks ? RELATIVE_DEDENT_WITH_BLANK_PENALTY : RELATIVE_DEDENT_PENALTY;
        }
    }

    /**
     * Compares two scores.
     *
     * @param s1 the first score
     * @param s2 the second score
     * @return a negative value if {@code s1} is the better (smaller) score, a
     *         positive value if {@code s2} is, and zero if they are equivalent.
     *         The value is {@code INDENT_WEIGHT} times the sign of the
     *         effective-indent difference plus the penalty difference.
     */
    public static int score_cmp(split_score s1, split_score s2) {
        /* -1 if s1.effective_indent < s2.effective_indent, etc. */
        int cmp_indents = ((s1.effective_indent > s2.effective_indent ? 1 : 0) -
                (s1.effective_indent < s2.effective_indent ? 1 : 0));

        return INDENT_WEIGHT * cmp_indents + (s1.penalty - s2.penalty);
    }
}
0 commit comments