caio.co/de/caca


Cleanup debug code by Caio 9 months ago (log)

Blob urso/src/rename/mod.rs

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
use gix::{
    bstr::{BStr, ByteSlice},
    object::tree::diff::{change::Event, Action},
    ObjectId, Tree,
};

use std::path::PathBuf;

use crate::error::{wrap_err, WrappedError};

pub(crate) enum RenameError<E> {
    Repo(E),
    Wrapped(WrappedError),
}

impl<E> From<WrappedError> for RenameError<E> {
    fn from(value: WrappedError) -> Self {
        RenameError::Wrapped(value)
    }
}

// Find out if `path` was created via renaming a path
// from the `parent` tree
//
// Takes a path that exists in the current tree and does
// NOT exist in the parent
//
// Yields back the path and object id of a blob in the
// parent tree that is most similar to the one pointed
// at by the input path
pub(crate) fn find_rename<R, E>(
    repo: R,
    id: ObjectId,
    current: &Tree<'_>,
    parent: &Tree<'_>,
) -> Result<Option<(PathBuf, ObjectId)>, RenameError<E>>
where
    R: Repo<Error = E>,
{
    let mut candidates = Vec::new();
    let target_header = repo.get_header(id).map_err(RenameError::Repo)?;

    // stop conditions:
    // - error
    // - identity found
    let mut map_err = None;
    let mut by_identity = None;

    map_deleted_blobs(current, parent, |location, id| -> bool {
        if target_header.id == id {
            // trivial rename: path changed, blob content didn't
            by_identity = Some(gix::path::from_bstr(location).into_owned());
            return false;
        }
        match repo.get_header(id) {
            Ok(header) => {
                // pruning candidates set:
                // since similarity is based on the byte length of the
                // diff, the total sizes can be used to figure out
                // if there's even a chance they'll be similar enough
                let max = target_header.size.max(header.size) as f32;
                let delta = target_header.size.abs_diff(header.size) as f32;
                if max * repo.min_similarity() >= delta {
                    // XXX should sniff the bytes prior to mime-based filtering
                    let path = location.to_path_lossy();
                    let (guessed, is_text) =
                        crate::mime::guess_from_path(&path).unwrap_or((crate::mime::BINARY, false));
                    if is_text {
                        candidates.push((path.into_owned(), header));
                    } else {
                        tracing::trace!(
                            path = tracing::field::debug(location),
                            mime = guessed,
                            "skipped: not text"
                        );
                    }
                } else {
                    tracing::trace!(
                        path = tracing::field::debug(location),
                        "skipped: size difference too large"
                    );
                }
                true
            }
            Err(e) => {
                map_err = Some(e);
                false
            }
        }
    })?;

    if let Some(err) = map_err {
        return Err(RenameError::Repo(err));
    }

    if let Some(found) = by_identity {
        return Ok(Some((found, id)));
    }

    if candidates.is_empty() {
        return Ok(None);
    }

    // The set of candidates is potentially massive,
    // checking the similarity of every single one would
    // be too costly so i'll use a heuristic that tries
    // to look at the most likely candidates first and
    // stops as soon as it finds one that's similar enough.
    //
    // Worst case is when a rename can't be found.
    //
    // Since byte delta is the criteria, prioritize
    // blobs with similar length to the target.
    // Break even with the path, in order to guarantee
    // output stability
    candidates.sort_unstable_by(|a, b| {
        a.1.size
            .abs_diff(target_header.size)
            .cmp(&b.1.size.abs_diff(target_header.size))
            .then_with(|| a.1.id.cmp(&b.1.id))
    });
    tracing::trace!(
        candidates = tracing::field::debug(&candidates),
        "rename candidates"
    );

    for (path, header) in candidates {
        let similarity = repo.similarity(id, header.id).map_err(RenameError::Repo)?;
        if similarity >= repo.min_similarity() {
            return Ok(Some((path, header.id)));
        } else {
            tracing::trace!(
                path = tracing::field::debug(&path),
                threshold = repo.min_similarity(),
                similarity,
                "not similar enough"
            );
        }
    }

    Ok(None)
}

pub(crate) trait Repo {
    type Error;

    // FIXME ensure within 0..=1
    fn min_similarity(&self) -> f32;

    fn get_header(&self, id: ObjectId) -> std::result::Result<crate::diff::Header, Self::Error>;

    fn similarity(&self, prev: ObjectId, cur: ObjectId) -> std::result::Result<f32, Self::Error>;
}

// maps every blob that would be removed when transforming
// `parent` into `current`
fn map_deleted_blobs<F>(
    current: &Tree<'_>,
    parent: &Tree<'_>,
    mut cb: F,
) -> Result<(), WrappedError>
where
    F: FnMut(&BStr, ObjectId) -> bool,
{
    let outcome = parent
        .changes()
        .map_err(|e| {
            wrap_err(
                format!("preparing to diff tree {} vs {}", parent.id, current.id,),
                e,
            )
        })?
        .track_path()
        .track_rewrites(None)
        .for_each_to_obtain_tree(current, |change| -> std::result::Result<_, WrappedError> {
            if let Event::Deletion { entry_mode, id } = change.event {
                if entry_mode.is_blob() && !cb(change.location, id.detach()) {
                    return Ok(Action::Cancel);
                }
            };
            Ok(Action::Continue)
        });

    match outcome {
        // If comparing the trees yields no error or gets cancelled
        // manually, everything went fine
        Ok(_)
        | Err(gix::object::tree::diff::for_each::Error::Diff(
            gix::diff::tree::changes::Error::Cancelled,
        )) => Ok(()),
        Err(e) => Err(wrap_err(
            format!(
                "walking diff between trees {} and {}",
                current.id, parent.id
            ),
            e,
        )),
    }
}