Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve performance of VCS diff listing #343

Merged
merged 1 commit into from
Feb 18, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/atlaspack_vcs/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ anyhow = { workspace = true }
git2 = { workspace = true }
hex = { workspace = true }
nom = { workspace = true }
rayon = { workspace = true }
serde = { workspace = true, features = ["derive"] }
serde_yaml = { workspace = true }
tracing = { workspace = true }
Expand Down
158 changes: 85 additions & 73 deletions crates/atlaspack_vcs/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@
use anyhow::anyhow;
use git2::{DiffOptions, Repository};
use rayon::prelude::*;
use serde::{Deserialize, Serialize};
use std::{
hash::{Hash, Hasher},
Expand Down Expand Up @@ -148,11 +149,15 @@ impl VCSState {
exclude_patterns: &[String],
failure_mode: FailureMode,
) -> anyhow::Result<VCSState> {
tracing::info!("Reading VCS state");
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for my understanding, why do we refer the "full path" rather than doing use tracing::{debug, info, error}; so that we can avoid repeating tracing:: ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't thought too much about it but I generally have used the full path because error, info, etc. are common names that may be shadowed.

let repo = Repository::open(path)?;
let head = repo.revparse_single("HEAD")?.peel_to_commit()?;
let git_hash = head.id().to_string();
tracing::info!("Found head commit");
let file_listing = vcs_list_dirty_files(path, exclude_patterns)?;
tracing::info!("Listed dirty files");
let yarn_states = list_yarn_states(path, failure_mode)?;
tracing::info!("Listed yarn states");

Ok(VCSState {
git_hash,
Expand All @@ -166,46 +171,49 @@ pub fn list_yarn_states(
repo: &Path,
failure_mode: FailureMode,
) -> anyhow::Result<Vec<YarnSnapshot>> {
let files = vcs_list_files(repo)?;
let yarn_lock_files = files
.iter()
.filter(|file| file.ends_with("yarn.lock"))
.cloned()
.collect::<Vec<_>>();

let mut yarn_states = Vec::new();

for file in yarn_lock_files {
let yarn_lock_path = repo.join(file);
let node_modules_relative_path = yarn_lock_path.parent().unwrap().join("node_modules");

let yarn_lock_blob = std::fs::read(&yarn_lock_path)?;
let yarn_lock: Result<YarnLock, _> = parse_yarn_lock(&String::from_utf8(yarn_lock_blob)?);
if failure_mode != FailureMode::FailOnMissingNodeModules && yarn_lock.is_err() {
continue;
};
let yarn_lock = yarn_lock?;

let node_modules_path = repo.join(&node_modules_relative_path);
let yarn_state = parse_yarn_state_file(&node_modules_path).map_err(|err| {
tracing::warn!(
"Failed to read .yarn-state.yml {node_modules_relative_path:?} {}",
err
);

anyhow!("Failed to read .yarn-state.yml at {node_modules_relative_path:?}: {err}")
});
if failure_mode != FailureMode::FailOnMissingNodeModules && yarn_state.is_err() {
continue;
};
let yarn_state = yarn_state?;
let yarn_snapshot = YarnSnapshot {
yarn_lock_path,
yarn_lock,
yarn_state,
};
yarn_states.push(yarn_snapshot);
}
let yarn_lock_files = vcs_list_yarn_lock_files(repo)?;
tracing::info!("Found yarn.lock files");

let yarn_states = yarn_lock_files
.par_iter()
.map(|file| -> anyhow::Result<_> {
let yarn_lock_path = repo.join(file);
let node_modules_relative_path = yarn_lock_path.parent().unwrap().join("node_modules");

let yarn_lock_blob = std::fs::read(&yarn_lock_path)?;
let yarn_lock: Result<YarnLock, _> = parse_yarn_lock(&String::from_utf8(yarn_lock_blob)?);
if failure_mode != FailureMode::FailOnMissingNodeModules && yarn_lock.is_err() {
return Ok(None);
};
let yarn_lock = yarn_lock?;

let node_modules_path = repo.join(&node_modules_relative_path);
let yarn_state = parse_yarn_state_file(&node_modules_path).map_err(|err| {
tracing::warn!(
"Failed to read .yarn-state.yml {node_modules_relative_path:?} {}",
err
);

anyhow!("Failed to read .yarn-state.yml at {node_modules_relative_path:?}: {err}")
});
if failure_mode != FailureMode::FailOnMissingNodeModules && yarn_state.is_err() {
return Ok(None);
};
let yarn_state = yarn_state?;
let yarn_snapshot = YarnSnapshot {
yarn_lock_path,
yarn_lock,
yarn_state,
};

Ok(Some(yarn_snapshot))
})
.filter_map(|result| match result {
Ok(Some(yarn_snapshot)) => Some(Ok(yarn_snapshot)),
Ok(None) => None,
Err(err) => Some(Err(err)),
})
.collect::<anyhow::Result<Vec<_>>>()?;

Ok(yarn_states)
}
Expand Down Expand Up @@ -324,15 +332,18 @@ pub fn get_changed_files(
Ok(changed_files)
}

pub fn vcs_list_files(dir: &Path) -> Result<Vec<String>, anyhow::Error> {
pub fn vcs_list_yarn_lock_files(dir: &Path) -> Result<Vec<String>, anyhow::Error> {
let mut command = Command::new("git");
command
.arg("ls-files")
.arg("--cached")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it how we are going to report also staged files?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, this is just to list the yarn.lock files committed

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we dealing with staged files in a follow up PR?

The command in line 366 will report only unstaged

.arg("--exclude=yarn.lock")
.arg("--ignored")
.arg("-z") // We separate rows by \0 to avoid issues with newlines
.current_dir(dir);
let command = command.output()?;
if !command.status.success() {
tracing::error!("git lfs-files: {:?}", String::from_utf8(command.stderr));
tracing::error!("git ls-files: {:?}", String::from_utf8(command.stderr));
return Err(anyhow::anyhow!("Git ls-files failed"));
}
let output = String::from_utf8(command.stdout).unwrap();
Expand All @@ -354,50 +365,51 @@ pub fn vcs_list_dirty_files(
let mut command = Command::new("git");
command
.arg("ls-files")
// .arg("--format=%(objectname)%(path)")
.arg("--deleted")
.arg("--modified")
.arg("--others")
.arg("--exclude-standard")
.arg("-z") // We separate rows by \0 to avoid issues with newlines
.current_dir(dir);
for pattern in exclude_patterns {
command.arg(format!("--exclude={}", pattern));
}
let command = command.output()?;
if !command.status.success() {
tracing::error!("git lfs-files: {:?}", String::from_utf8(command.stderr));
tracing::error!("git ls-files: {:?}", String::from_utf8(command.stderr));
return Err(anyhow::anyhow!("Git ls-files failed"));
}
let output = String::from_utf8(command.stdout).unwrap();
let lines = output.split_terminator('\0');

let mut results = Vec::new();

for relative_path in lines {
tracing::debug!("Hashing file {}", relative_path);
let path = Path::new(relative_path);
let path = dir.join(path);

// We hash the contents of the file but if it's a symlink we hash the target
// path instead rather than following the link.
let metadata = std::fs::symlink_metadata(&path)?;
let contents = if metadata.is_symlink() {
std::fs::read_link(&path)?
.to_str()
.unwrap()
.as_bytes()
.to_vec()
} else {
std::fs::read(&path)?
};
let mut state = std::collections::hash_map::DefaultHasher::new();
contents.hash(&mut state);
let hash = state.finish();
results.push(VCSFile {
path: relative_path.into(),
hash,
});
}
let lines: Vec<_> = output.split_terminator('\0').map(String::from).collect();

let results = lines
.par_iter()
.map(|relative_path| {
tracing::info!("Hashing file {}", relative_path);
let path = Path::new(relative_path);
let path = dir.join(path);

// We hash the contents of the file but if it's a symlink we hash the target
// path instead rather than following the link.
let metadata = std::fs::symlink_metadata(&path)?;
let contents = if metadata.is_symlink() {
std::fs::read_link(&path)?
.to_str()
.unwrap()
.as_bytes()
.to_vec()
} else {
std::fs::read(&path)?
};
let mut state = std::collections::hash_map::DefaultHasher::new();
contents.hash(&mut state);
let hash = state.finish();
Ok(VCSFile {
path: relative_path.into(),
hash,
})
})
.collect::<anyhow::Result<Vec<_>>>()?;

Ok(results)
}
Loading