Blob urso/src/rename/mod.rs
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 |
use gix::{ bstr::{BStr, ByteSlice}, object::tree::diff::{change::Event, Action}, ObjectId, Tree, }; use std::path::PathBuf; use crate::error::{wrap_err, WrappedError}; pub(crate) enum RenameError<E> { Repo(E), Wrapped(WrappedError), } impl<E> From<WrappedError> for RenameError<E> { fn from(value: WrappedError) -> Self { RenameError::Wrapped(value) } } // Find out if `path` was created via renaming a path // from the `parent` tree // // Takes a path that exists in the current tree and does // NOT exist in the parent // // Yields back the path and object id of a blob in the // parent tree that is most similar to the one pointed // at by the input path pub(crate) fn find_rename<R, E>( repo: R, id: ObjectId, current: &Tree<'_>, parent: &Tree<'_>, ) -> Result<Option<(PathBuf, ObjectId)>, RenameError<E>> where R: Repo<Error = E>, { let mut candidates = Vec::new(); let target_header = repo.get_header(id).map_err(RenameError::Repo)?; // stop conditions: // - error // - identity found let mut map_err = None; let mut by_identity = None; map_deleted_blobs(current, parent, |location, id| -> bool { if target_header.id == id { // trivial rename: path changed, blob content didn't by_identity = Some(gix::path::from_bstr(location).into_owned()); return false; } match repo.get_header(id) { Ok(header) => { // pruning candidates set: // since similarity is based on the byte length of the // diff, the total sizes can be used to figure out // if there's even a chance they'll be similar enough let max = target_header.size.max(header.size) as f32; let delta = target_header.size.abs_diff(header.size) as f32; if max * repo.min_similarity() >= delta { // XXX should sniff the bytes prior to mime-based filtering let path = location.to_path_lossy(); let (guessed, is_text) = crate::mime::guess_from_path(&path).unwrap_or((crate::mime::BINARY, false)); if is_text { candidates.push((path.into_owned(), header)); } else { tracing::trace!( path = tracing::field::debug(location), mime = guessed, "skipped: not text" ); } } else { tracing::trace!( path = tracing::field::debug(location), "skipped: size difference too large" ); } true } Err(e) => { map_err = Some(e); false } } })?; if let Some(err) = map_err { return Err(RenameError::Repo(err)); } if let Some(found) = by_identity { return Ok(Some((found, id))); } if candidates.is_empty() { return Ok(None); } // The set of candidates is potentially massive, // checking the similarity of every single one would // be too costly so i'll use a heuristic that tries // to look at the most likely candidates first and // stops as soon as it finds one that's similar enough. // // Worst case is when a rename can't be found. // // Since byte delta is the criteria, prioritize // blobs with similar length to the target. // Break even with the path, in order to guarantee // output stability candidates.sort_unstable_by(|a, b| { a.1.size .abs_diff(target_header.size) .cmp(&b.1.size.abs_diff(target_header.size)) .then_with(|| a.1.id.cmp(&b.1.id)) }); tracing::trace!( candidates = tracing::field::debug(&candidates), "rename candidates" ); for (path, header) in candidates { let similarity = repo.similarity(id, header.id).map_err(RenameError::Repo)?; if similarity >= repo.min_similarity() { return Ok(Some((path, header.id))); } else { tracing::trace!( path = tracing::field::debug(&path), threshold = repo.min_similarity(), similarity, "not similar enough" ); } } Ok(None) } pub(crate) trait Repo { type Error; // FIXME ensure within 0..=1 fn min_similarity(&self) -> f32; fn get_header(&self, id: ObjectId) -> std::result::Result<crate::diff::Header, Self::Error>; fn similarity(&self, prev: ObjectId, cur: ObjectId) -> std::result::Result<f32, Self::Error>; } // maps every blob that would be removed when transforming // `parent` into `current` fn map_deleted_blobs<F>( current: &Tree<'_>, parent: &Tree<'_>, mut cb: F, ) -> Result<(), WrappedError> where F: FnMut(&BStr, ObjectId) -> bool, { let outcome = parent .changes() .map_err(|e| { wrap_err( format!("preparing to diff tree {} vs {}", parent.id, current.id,), e, ) })? .track_path() .track_rewrites(None) .for_each_to_obtain_tree(current, |change| -> std::result::Result<_, WrappedError> { if let Event::Deletion { entry_mode, id } = change.event { if entry_mode.is_blob() && !cb(change.location, id.detach()) { return Ok(Action::Cancel); } }; Ok(Action::Continue) }); match outcome { // If comparing the trees yields no error or gets cancelled // manually, everything went fine Ok(_) | Err(gix::object::tree::diff::for_each::Error::Diff( gix::diff::tree::changes::Error::Cancelled, )) => Ok(()), Err(e) => Err(wrap_err( format!( "walking diff between trees {} and {}", current.id, parent.id ), e, )), } } |