Skip to content

Commit 94dac4b

Browse files
cruesslerByron
authored andcommitted
Add script to generate test-cases for gix-diff to validate that diff-algorithms match (diff-slider problem)
We assume that most people would expect the output of `gix-diff` to be identical to that of `git diff`. Therefore, we want an easy way to compare the output of `gix-diff` and `git diff` on a large corpus of diffs to make sure they match. This commit tries to make as much of that process scripted and as easily reproducible as possible. There is a repository that contains “\[t\]ools for experimenting \[with\] diff "slider" heuristics”: [diff-slider-tools]. We can use `diff-slider-tools` to, for any git repository, generate a list of locations where a diff between two files is not unambiguous. This commit creates a tool that takes this a list of locations generated by `diff-slider-tools` and turns it into test cases that can be run as part of `gix-diff`’s test suite. This enables us to, whenever we want, run large-scale tests comparing `gix-diff` to `gix diff`, hopefully uncovering any edge case we might have missed in our slider heuristic and making sure we conform to our users’ expectations. [diff-slider-tools]: https://github.com/mhagger/diff-slider-tools 1. Follow these instructions to generate a file containing sliders: https://github.com/mhagger/diff-slider-tools/blob/b59ed13d7a2a6cfe14a8f79d434b6221cc8b04dd/README.md?plain=1#L122-L146 2. Run `create-diff-cases` to create the script in `gix-diff/tests/fixtures/`. The script which will be called `make_diff_for_sliders_repo.sh`. ``` # run inside `gitoxide` cargo run --package internal-tools -- create-diff-cases --sliders-file $DIFF_SLIDER_TOOLS/corpus/git.sliders --worktree-dir $DIFF_SLIDER_TOOLS/corpus/git.git/ --destination-dir gix-diff/tests/fixtures/ ``` 3. Run `cargo test sliders -- --nocapture` inside `gix-diff/tests` to run the actual tests.
1 parent 0a576b9 commit 94dac4b

File tree

8 files changed

+433
-1
lines changed

8 files changed

+433
-1
lines changed

gix-diff/src/blob/unified_diff/impls.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -260,7 +260,8 @@ where
260260
}
261261

262262
impl DiffLineKind {
263-
const fn to_prefix(self) -> char {
263+
/// Returns a one-character representation for use in unified diffs.
264+
pub const fn to_prefix(self) -> char {
264265
match self {
265266
DiffLineKind::Context => ' ',
266267
DiffLineKind::Add => '+',

gix-diff/tests/diff/blob/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
pub(crate) mod pipeline;
22
mod platform;
3+
mod slider;
34
mod unified_diff;

gix-diff/tests/diff/blob/slider.rs

Lines changed: 258 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,258 @@
1+
use std::{iter::Peekable, path::PathBuf};
2+
3+
use gix_diff::blob::{
4+
intern::TokenSource,
5+
unified_diff::{ConsumeHunk, ContextSize, HunkHeader},
6+
Algorithm, UnifiedDiff,
7+
};
8+
use gix_object::bstr::{self, BString, ByteVec};
9+
10+
#[derive(Debug, PartialEq)]
11+
struct DiffHunk {
12+
header: HunkHeader,
13+
lines: BString,
14+
}
15+
16+
struct DiffHunkRecorder {
17+
inner: Vec<DiffHunk>,
18+
}
19+
20+
impl DiffHunkRecorder {
21+
fn new() -> Self {
22+
Self { inner: Vec::new() }
23+
}
24+
}
25+
26+
impl ConsumeHunk for DiffHunkRecorder {
27+
type Out = Vec<DiffHunk>;
28+
29+
fn consume_hunk(
30+
&mut self,
31+
header: HunkHeader,
32+
lines: &[(gix_diff::blob::unified_diff::DiffLineKind, &[u8])],
33+
) -> std::io::Result<()> {
34+
let mut buf = Vec::new();
35+
36+
for &(kind, line) in lines {
37+
buf.push(kind.to_prefix() as u8);
38+
buf.extend_from_slice(line);
39+
buf.push(b'\n');
40+
}
41+
42+
let diff_hunk = DiffHunk {
43+
header,
44+
lines: buf.into(),
45+
};
46+
47+
self.inner.push(diff_hunk);
48+
49+
Ok(())
50+
}
51+
52+
fn finish(self) -> Self::Out {
53+
self.inner
54+
}
55+
}
56+
57+
struct Baseline<'a> {
58+
lines: Peekable<bstr::Lines<'a>>,
59+
}
60+
61+
mod baseline {
62+
use std::path::Path;
63+
64+
use gix_diff::blob::unified_diff::HunkHeader;
65+
use gix_object::bstr::ByteSlice;
66+
67+
use super::{Baseline, DiffHunk};
68+
69+
static START_OF_HEADER: &[u8; 4] = b"@@ -";
70+
71+
impl Baseline<'_> {
72+
pub fn collect(baseline_path: impl AsRef<Path>) -> std::io::Result<Vec<DiffHunk>> {
73+
let content = std::fs::read(baseline_path)?;
74+
75+
let mut baseline = Baseline {
76+
lines: content.lines().peekable(),
77+
};
78+
79+
baseline.skip_header();
80+
81+
Ok(baseline.collect())
82+
}
83+
84+
fn skip_header(&mut self) {
85+
// diff --git a/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa b/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
86+
// index ccccccc..ddddddd 100644
87+
// --- a/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
88+
// +++ b/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
89+
90+
let line = self.lines.next().expect("line to be present");
91+
assert!(line.starts_with(b"diff --git "));
92+
93+
let line = self.lines.next().expect("line to be present");
94+
assert!(line.starts_with(b"index "));
95+
96+
let line = self.lines.next().expect("line to be present");
97+
assert!(line.starts_with(b"--- "));
98+
99+
let line = self.lines.next().expect("line to be present");
100+
assert!(line.starts_with(b"+++ "));
101+
}
102+
103+
/// Parse diff hunk headers that conform to the unified diff hunk header format.
104+
///
105+
/// The parser is very primitive and relies on the fact that `+18` is parsed as `18`. This
106+
/// allows us to split the input on ` ` and `,` only.
107+
///
108+
/// @@ -18,6 +18,7 @@ abc def ghi
109+
/// @@ -{before_hunk_start},{before_hunk_len} +{after_hunk_start},{after_hunk_len} @@
110+
fn parse_hunk_header(&self, line: &[u8]) -> gix_testtools::Result<HunkHeader> {
111+
let Some(line) = line.strip_prefix(START_OF_HEADER) else {
112+
todo!()
113+
};
114+
115+
let parts: Vec<_> = line.split(|b| *b == b' ' || *b == b',').collect();
116+
let [before_hunk_start, before_hunk_len, after_hunk_start, after_hunk_len, ..] = parts[..] else {
117+
todo!()
118+
};
119+
120+
Ok(HunkHeader {
121+
before_hunk_start: self.parse_number(before_hunk_start),
122+
before_hunk_len: self.parse_number(before_hunk_len),
123+
after_hunk_start: self.parse_number(after_hunk_start),
124+
after_hunk_len: self.parse_number(after_hunk_len),
125+
})
126+
}
127+
128+
fn parse_number(&self, bytes: &[u8]) -> u32 {
129+
bytes
130+
.to_str()
131+
.expect("to be a valid UTF-8 string")
132+
.parse::<u32>()
133+
.expect("to be a number")
134+
}
135+
}
136+
137+
impl Iterator for Baseline<'_> {
138+
type Item = DiffHunk;
139+
140+
fn next(&mut self) -> Option<Self::Item> {
141+
let mut hunk_header = None;
142+
let mut hunk_lines = Vec::new();
143+
144+
while let Some(line) = self.lines.next() {
145+
if line.starts_with(START_OF_HEADER) {
146+
assert!(hunk_header.is_none(), "should not overwrite existing hunk_header");
147+
hunk_header = self.parse_hunk_header(line).ok();
148+
149+
continue;
150+
}
151+
152+
match line[0] {
153+
b' ' | b'+' | b'-' => {
154+
hunk_lines.extend_from_slice(line);
155+
hunk_lines.push(b'\n');
156+
}
157+
_ => todo!(),
158+
}
159+
160+
match self.lines.peek() {
161+
Some(next_line) if next_line.starts_with(START_OF_HEADER) => break,
162+
None => break,
163+
_ => {}
164+
}
165+
}
166+
167+
hunk_header.map(|hunk_header| DiffHunk {
168+
header: hunk_header,
169+
lines: hunk_lines.into(),
170+
})
171+
}
172+
}
173+
}
174+
175+
#[test]
176+
fn sliders() -> gix_testtools::Result {
177+
let worktree_path = fixture_path()?;
178+
let asset_dir = worktree_path.join("assets");
179+
180+
let dir = std::fs::read_dir(&worktree_path)?;
181+
182+
for entry in dir {
183+
let entry = entry?;
184+
let file_name = entry.file_name().into_string().expect("to be string");
185+
186+
if !file_name.ends_with(".baseline") {
187+
continue;
188+
}
189+
190+
let parts: Vec<_> = file_name.split('.').collect();
191+
let [name, algorithm, ..] = parts[..] else {
192+
unimplemented!()
193+
};
194+
let algorithm = match algorithm {
195+
"myers" => Algorithm::Myers,
196+
"histogram" => Algorithm::Histogram,
197+
_ => unimplemented!(),
198+
};
199+
200+
let parts: Vec<_> = name.split('-').collect();
201+
let [old_blob_id, new_blob_id] = parts[..] else {
202+
unimplemented!();
203+
};
204+
205+
let old_data = std::fs::read(asset_dir.join(format!("{old_blob_id}.blob")))?;
206+
let new_data = std::fs::read(asset_dir.join(format!("{new_blob_id}.blob")))?;
207+
208+
let interner = gix_diff::blob::intern::InternedInput::new(
209+
tokens_for_diffing(old_data.as_slice()),
210+
tokens_for_diffing(new_data.as_slice()),
211+
);
212+
213+
let actual = gix_diff::blob::diff(
214+
algorithm,
215+
&interner,
216+
UnifiedDiff::new(&interner, DiffHunkRecorder::new(), ContextSize::symmetrical(3)),
217+
)?;
218+
219+
let baseline_path = worktree_path.join(file_name);
220+
let baseline = Baseline::collect(baseline_path).unwrap();
221+
222+
let actual = actual
223+
.iter()
224+
.fold(BString::default(), |mut acc, diff_hunk| {
225+
acc.push_str(diff_hunk.header.to_string().as_str());
226+
acc.push(b'\n');
227+
228+
acc.extend_from_slice(&diff_hunk.lines);
229+
230+
acc
231+
})
232+
.to_string();
233+
234+
let baseline = baseline
235+
.iter()
236+
.fold(BString::default(), |mut acc, diff_hunk| {
237+
acc.push_str(diff_hunk.header.to_string().as_str());
238+
acc.push(b'\n');
239+
240+
acc.extend_from_slice(&diff_hunk.lines);
241+
242+
acc
243+
})
244+
.to_string();
245+
246+
pretty_assertions::assert_eq!(actual, baseline);
247+
}
248+
249+
Ok(())
250+
}
251+
252+
fn tokens_for_diffing(data: &[u8]) -> impl TokenSource<Token = &[u8]> {
253+
gix_diff::blob::sources::byte_lines(data)
254+
}
255+
256+
fn fixture_path() -> gix_testtools::Result<PathBuf> {
257+
gix_testtools::scripted_fixture_read_only_standalone("make_diff_for_sliders_repo.sh")
258+
}
Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
#!/usr/bin/env bash
2+
set -eu -o pipefail
3+
4+
# This script is a placeholder for a script generated by the
5+
# `create-diff-cases` subcommand of `internal-tools`. The placeholder does
6+
# nothing except making the associated test trivially pass.
7+
#
8+
# ## Usage
9+
#
10+
# Follow these instructions to generate a file containing sliders:
11+
# https://github.com/mhagger/diff-slider-tools/blob/b59ed13d7a2a6cfe14a8f79d434b6221cc8b04dd/README.md?plain=1#L122-L146
12+
#
13+
# Run `create-diff-cases` to create the script in `gix-diff/tests/fixtures/`.
14+
# The script will be called `make_diff_for_sliders_repo.sh` and it will
15+
# overwrite the placeholder.
16+
#
17+
# ```
18+
# # run inside `gitoxide`
19+
# cargo run --package internal-tools -- create-diff-cases --sliders-file $PATH_TO_DIFF_SLIDER_TOOLS/corpus/git.sliders --worktree-dir $PATH_TO_DIFF_SLIDER_TOOLS/corpus/git.git/ --destination-dir gix-diff/tests/fixtures/
20+
# ```
21+
#
22+
# Run `cargo test sliders -- --nocapture` inside `gix-diff/tests` to run the
23+
# actual tests.

tests/it/src/args.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,34 @@ pub enum Subcommands {
123123
#[clap(value_parser = AsPathSpec)]
124124
patterns: Vec<gix::pathspec::Pattern>,
125125
},
126+
/// Take a slider file generated with the help of [diff-slider-tools] and turn it into a series
127+
/// of baseline diffs to be used in [slider-rs].
128+
///
129+
/// See [make-diff-for-sliders-repo] for details.
130+
///
131+
/// [diff-slider-tools]: https://github.com/mhagger/diff-slider-tools
132+
/// [slider-rs]: gix-diff/tests/diff/blob/slider.rs
133+
/// [make-diff-for-sliders-repo]: gix-diff/tests/fixtures/make_diff_for_sliders_repo.sh
134+
CreateDiffCases {
135+
/// Don't really copy anything.
136+
#[clap(long, short = 'n')]
137+
dry_run: bool,
138+
/// The `.sliders` file that contains a list of sliders.
139+
#[clap(long)]
140+
sliders_file: PathBuf,
141+
/// The git root to extract the diff-related parts from.
142+
#[clap(long)]
143+
worktree_dir: PathBuf,
144+
/// The directory into which to copy the files.
145+
#[clap(long)]
146+
destination_dir: PathBuf,
147+
/// The number of sliders to generate test cases for.
148+
#[clap(long, default_value_t = 10)]
149+
count: usize,
150+
/// The directory to place assets in.
151+
#[clap(long)]
152+
asset_dir: Option<BString>,
153+
},
126154
/// Check for executable bits that disagree with shebangs.
127155
///
128156
/// This checks committed and staged files, but not anything unstaged, to find shell scripts

0 commit comments

Comments
 (0)