diff --git a/Cargo.lock b/Cargo.lock
index 5bbaee2..d6df0af 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -660,6 +660,27 @@ version = "0.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
 
+[[package]]
+name = "ownable"
+version = "0.6.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4dcba94d1536fcc470287d96fd26356c38da8215fdb9a74285b09621f35d9350"
+dependencies = [
+ "ownable-macro",
+]
+
+[[package]]
+name = "ownable-macro"
+version = "0.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b2c91d2781624dec1234581a1a01e63638f36546ad72ee82873ac1b84f41117b"
+dependencies = [
+ "proc-macro-error",
+ "proc-macro2",
+ "quote",
+ "syn",
+]
+
 [[package]]
 name = "phf"
 version = "0.11.2"
@@ -748,6 +769,29 @@ dependencies = [
  "toml_edit",
 ]
 
+[[package]]
+name = "proc-macro-error"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "da25490ff9892aab3fcf7c36f08cfb902dd3e71ca0f9f9517bea02a73a5ce38c"
+dependencies = [
+ "proc-macro-error-attr",
+ "proc-macro2",
+ "quote",
+ "version_check",
+]
+
+[[package]]
+name = "proc-macro-error-attr"
+version = "1.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a1be40180e52ecc98ad80b184934baf3d0d29f979574e439af5a55274b35f869"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "version_check",
+]
+
 [[package]]
 name = "proc-macro2"
 version = "1.0.78"
@@ -797,7 +841,9 @@ dependencies = [
  "num_enum",
  "oem_cp",
  "oval",
+ "ownable",
  "pretty-hex",
+ "temp-dir",
  "test-log",
  "thiserror",
  "tracing",
@@ -820,6 +866,7 @@ dependencies = [
  "test-log",
  "tracing",
  "tracing-subscriber",
+ "winnow",
 ]
 
 [[package]]
@@ -971,6 +1018,12 @@ dependencies = [
  "unicode-ident",
 ]
 
+[[package]]
+name = "temp-dir"
+version = "0.1.12"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dd16aa9ffe15fe021c6ee3766772132c6e98dfa395a167e16864f61a9cfb71d6"
+
 [[package]]
 name = "test-log"
 version = "0.2.14"
@@ -1148,6 +1201,12 @@ version = "0.1.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d"
 
+[[package]]
+name = "version_check"
+version = "0.9.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f"
+
 [[package]]
 name = "wasm-bindgen"
 version = "0.2.90"
diff --git a/Justfile b/Justfile
index 1e869da..0b3e771 100644
--- a/Justfile
+++ b/Justfile
@@ -6,7 +6,7 @@ _default:
 check:
     cargo hack clippy --each-feature
 
-docs:
+doc:
     RUSTDOCFLAGS="-D warnings" cargo doc --all-features --no-deps
 
 # Run all tests locally
diff --git a/rc-zip-sync/Cargo.toml b/rc-zip-sync/Cargo.toml
index 020a80a..25b2c2f 100644
--- a/rc-zip-sync/Cargo.toml
+++ b/rc-zip-sync/Cargo.toml
@@ -24,6 +24,7 @@ positioned-io = { version = "0.3.3", optional = true }
 rc-zip = { version = "3.0.0", path = "../rc-zip" }
 oval = "2.0.0"
 tracing = "0.1.40"
+winnow = "0.5.36"
 
 [features]
 default = ["file", "deflate"]
diff --git a/rc-zip-sync/examples/jean.rs b/rc-zip-sync/examples/jean.rs
index bf613d4..7d92451 100644
--- a/rc-zip-sync/examples/jean.rs
+++ b/rc-zip-sync/examples/jean.rs
@@ -1,8 +1,8 @@
 use cfg_if::cfg_if;
 use clap::{Parser, Subcommand};
 use humansize::{format_size, BINARY};
-use rc_zip::parse::{Archive, EntryContents, Method, Version};
-use rc_zip_sync::ReadZip;
+use rc_zip::parse::{Archive, EntryKind, Method, Version};
+use rc_zip_sync::{ReadZip, ReadZipStreaming};
 
 use std::{
     borrow::Cow,
@@ -62,6 +62,12 @@ enum Commands {
     Unzip {
         zipfile: PathBuf,
 
+        #[arg(long)]
+        dir: Option<String>,
+    },
+    UnzipStreaming {
+        zipfile: PathBuf,
+
         #[arg(long)]
         dir: Option<String>,
     },
@@ -76,15 +82,10 @@ fn main() {
 
 fn do_main(cli: Cli) -> Result<(), Box<dyn std::error::Error>> {
     fn info(archive: &Archive) {
-        if let Some(comment) = archive.comment() {
-            println!("Comment:\n{}", comment);
-        }
-
-        let has_zip64 = archive.entries().any(|entry| entry.inner.is_zip64);
-        if has_zip64 {
-            println!("Found Zip64 end of central directory locator")
+        if !archive.comment().is_empty() {
+            println!("Comment:\n{}", archive.comment());
         }
 
-        let mut creator_versions = HashSet::<Version>::new();
         let mut reader_versions = HashSet::<Version>::new();
         let mut methods = HashSet::<Method>::new();
         let mut compressed_size: u64 = 0;
@@ -94,27 +95,23 @@ fn do_main(cli: Cli) -> Result<(), Box<dyn std::error::Error>> {
         let mut num_files = 0;
 
         for entry in archive.entries() {
-            creator_versions.insert(entry.creator_version);
             reader_versions.insert(entry.reader_version);
-            match entry.contents() {
-                EntryContents::Symlink => {
+            match entry.kind() {
+                EntryKind::Symlink => {
                     num_symlinks += 1;
                 }
-                EntryContents::Directory => {
+                EntryKind::Directory => {
                     num_dirs += 1;
                 }
-                EntryContents::File => {
-                    methods.insert(entry.method());
+                EntryKind::File => {
+                    methods.insert(entry.method);
                     num_files += 1;
-                    compressed_size += entry.inner.compressed_size;
-                    uncompressed_size += entry.inner.uncompressed_size;
+                    compressed_size += entry.compressed_size;
+                    uncompressed_size += entry.uncompressed_size;
                 }
             }
         }
-        println!(
-            "Version made by: {:?}, required: {:?}",
-            creator_versions, reader_versions
-        );
+        println!("Versions: {:?}", reader_versions);
         println!("Encoding: {}, Methods: {:?}", archive.encoding(), methods);
         println!(
             "{} ({:.2}% compression) ({} files, {} dirs, {} symlinks)",
@@ -142,36 +139,33 @@ fn do_main(cli: Cli) -> Result<(), Box<dyn std::error::Error>> {
                     "{mode:>9} {size:>12} {name}",
                     mode = entry.mode,
                     name = if verbose {
-                        Cow::from(entry.name())
+                        Cow::Borrowed(&entry.name)
                     } else {
-                        Cow::from(entry.name().truncate_path(55))
+                        Cow::Owned(entry.name.truncate_path(55))
                     },
-                    size = format_size(entry.inner.uncompressed_size, BINARY),
+                    size = format_size(entry.uncompressed_size, BINARY),
                 );
                 if verbose {
                     print!(
                         " ({} compressed)",
-                        format_size(entry.inner.compressed_size, BINARY)
+                        format_size(entry.compressed_size, BINARY)
                     );
                     print!(
                         " {modified} {uid} {gid}",
-                        modified = entry.modified(),
+                        modified = entry.modified,
                         uid = Optional(entry.uid),
                         gid = Optional(entry.gid),
                     );
 
-                    if let EntryContents::Symlink = entry.contents() {
+                    if let EntryKind::Symlink = entry.kind() {
                         let mut target = String::new();
                         entry.reader().read_to_string(&mut target).unwrap();
                         print!("\t{target}", target = target);
                     }
 
-                    print!("\t{:?}", entry.method());
-                    if entry.inner.is_zip64 {
-                        print!("\tZip64");
-                    }
-                    if let Some(comment) = entry.comment() {
-                        print!("\t{comment}", comment = comment);
+                    print!("\t{:?}", entry.method);
+                    if !entry.comment.is_empty() {
+                        print!("\t{comment}", comment = entry.comment);
                     }
                 }
                 println!();
@@ -185,12 +179,10 @@ fn do_main(cli: Cli) -> Result<(), Box<dyn std::error::Error>> {
             let mut num_dirs = 0;
             let mut num_files = 0;
             let mut num_symlinks = 0;
-            let mut uncompressed_size: u64 = 0;
-            for entry in reader.entries() {
-                if let EntryContents::File = entry.contents() {
-                    uncompressed_size += entry.inner.uncompressed_size;
-                }
-            }
+            let uncompressed_size = reader
+                .entries()
+                .map(|entry| entry.uncompressed_size)
+                .sum::<u64>();
 
             let mut done_bytes: u64 = 0;
             use indicatif::{ProgressBar, ProgressStyle};
@@ -206,15 +198,14 @@ fn do_main(cli: Cli) -> Result<(), Box<dyn std::error::Error>> {
             let start_time = std::time::SystemTime::now();
 
             for entry in reader.entries() {
-                let entry_name = entry.name();
-                let entry_name = match sanitize_entry_name(entry_name) {
+                let entry_name = match entry.sanitized_name() {
                     Some(name) => name,
                     None => continue,
                 };
 
                 pbar.set_message(entry_name.to_string());
-                match entry.contents() {
-                    EntryContents::Symlink => {
+                match entry.kind() {
+                    EntryKind::Symlink => {
                         num_symlinks += 1;
 
                         cfg_if! {
@@ -250,7 +241,7 @@ fn do_main(cli: Cli) -> Result<(), Box<dyn std::error::Error>> {
                             }
                         }
                     }
-                    EntryContents::Directory => {
+                    EntryKind::Directory => {
                         num_dirs += 1;
                         let path = dir.join(entry_name);
                         std::fs::create_dir_all(
                             path.parent()
                                 .expect("all full entry paths should have parent paths"),
                         )?;
                     }
@@ -258,7 +249,7 @@ fn do_main(cli: Cli) -> Result<(), Box<dyn std::error::Error>> {
-                    EntryContents::File => {
+                    EntryKind::File => {
                         num_files += 1;
                         let path = dir.join(entry_name);
                         std::fs::create_dir_all(
@@ -268,16 +259,134 @@ fn do_main(cli: Cli) -> Result<(), Box<dyn std::error::Error>> {
                         let mut entry_writer = File::create(path)?;
                         let entry_reader = entry.reader();
                         let before_entry_bytes = done_bytes;
-                        let mut progress_reader = ProgressRead::new(
-                            entry_reader,
-                            entry.inner.uncompressed_size,
-                            |prog| {
+                        let mut progress_reader =
+                            ProgressReader::new(entry_reader, entry.uncompressed_size, |prog| {
+                                pbar.set_position(before_entry_bytes + prog.done);
+                            });
+
+                        let copied_bytes = std::io::copy(&mut progress_reader, &mut entry_writer)?;
+                        done_bytes = before_entry_bytes + copied_bytes;
+                    }
+                }
+            }
+            pbar.finish();
+            let duration = start_time.elapsed()?;
+            println!(
+                "Extracted {} (in {} files, {} dirs, {} symlinks)",
+                format_size(uncompressed_size, BINARY),
+                num_files,
+                num_dirs,
+                num_symlinks
+            );
+            let seconds = (duration.as_millis() as f64) / 1000.0;
+            let bps = (uncompressed_size as f64 / seconds) as u64;
+            println!("Overall extraction speed: {} / s", format_size(bps, BINARY));
+        }
+        Commands::UnzipStreaming { zipfile, dir, .. } => {
+            let zipfile = File::open(zipfile)?;
+            let dir = PathBuf::from(dir.unwrap_or_else(|| ".".into()));
+
+            let mut num_dirs = 0;
+            let mut num_files = 0;
+            let mut num_symlinks = 0;
+
+            let mut done_bytes: u64 = 0;
+            use indicatif::{ProgressBar, ProgressStyle};
+            let pbar = ProgressBar::new(100);
+            pbar.set_style(
+                ProgressStyle::default_bar()
+                    .template("{eta_precise} [{bar:20.cyan/blue}] {wide_msg}")
+                    .unwrap()
+                    .progress_chars("=>-"),
+            );
+
+            let mut uncompressed_size = 0;
+            pbar.enable_steady_tick(Duration::from_millis(125));
+
+            let start_time = std::time::SystemTime::now();
+
+            let mut entry_reader = zipfile.stream_zip_entries_throwing_caution_to_the_wind()?;
+            loop {
+                let entry_name = match entry_reader.entry().sanitized_name() {
+                    Some(name) => name,
+                    None => continue,
+                };
+
+                pbar.set_message(entry_name.to_string());
+                match entry_reader.entry().kind() {
+                    EntryKind::Symlink => {
+                        num_symlinks += 1;
+
+                        cfg_if! {
+                            if #[cfg(windows)] {
+                                let path = dir.join(entry_name);
+                                std::fs::create_dir_all(
+                                    path.parent()
+                                        .expect("all full entry paths should have parent paths"),
+                                )?;
+                                let mut entry_writer = File::create(path)?;
+                                let mut entry_reader = entry.reader();
+                                std::io::copy(&mut entry_reader, &mut entry_writer)?;
+                            } else {
+                                let path = dir.join(entry_name);
+                                std::fs::create_dir_all(
+                                    path.parent()
+                                        .expect("all full entry paths should have parent paths"),
+                                )?;
+                                if let Ok(metadata) = std::fs::symlink_metadata(&path) {
+                                    if metadata.is_file() {
+                                        std::fs::remove_file(&path)?;
+                                    }
+                                }
+
+                                let mut src = String::new();
+                                entry_reader.read_to_string(&mut src)?;
+
+                                // validate pointing path before creating a symbolic link
+                                if src.contains("..") {
+                                    continue;
+                                }
+                                std::os::unix::fs::symlink(src, &path)?;
+                            }
+                        }
+                    }
+                    EntryKind::Directory => {
+                        num_dirs += 1;
+                        let path = dir.join(entry_name);
+                        std::fs::create_dir_all(
+                            path.parent()
+                                .expect("all full entry paths should have parent paths"),
+                        )?;
+                    }
+                    EntryKind::File => {
+                        num_files += 1;
+                        let path = dir.join(entry_name);
+                        std::fs::create_dir_all(
+                            path.parent()
+                                .expect("all full entry paths should have parent paths"),
+                        )?;
+                        let mut entry_writer = File::create(path)?;
+                        let before_entry_bytes = done_bytes;
+                        let total = entry_reader.entry().uncompressed_size;
+                        let mut progress_reader =
+                            ProgressReader::new(entry_reader, total, |prog| {
                                 pbar.set_position(before_entry_bytes + prog.done);
-                            },
-                        );
+                            });
 
                         let copied_bytes = std::io::copy(&mut progress_reader, &mut entry_writer)?;
+                        uncompressed_size += copied_bytes;
                         done_bytes = before_entry_bytes + copied_bytes;
+                        entry_reader = progress_reader.into_inner();
+                    }
+                }
+
+                match entry_reader.finish()? {
+                    Some(next_entry) => {
+                        entry_reader = next_entry;
+                    }
+                    None => {
+                        println!("End of archive!");
+                        break;
+                    }
+                }
+            }
             pbar.finish();
             let duration = start_time.elapsed()?;
             println!(
                 "Extracted {} (in {} files, {} dirs, {} symlinks)",
                 format_size(uncompressed_size, BINARY),
                 num_files,
                 num_dirs,
                 num_symlinks
             );
             let seconds = (duration.as_millis() as f64) / 1000.0;
             let bps = (uncompressed_size as f64 / seconds) as u64;
             println!("Overall extraction speed: {} / s", format_size(bps, BINARY));
         }
@@ -303,7 +412,7 @@ trait Truncate {
     fn truncate_path(&self, limit: usize) -> String;
 }
 
-impl Truncate for &str {
+impl Truncate for String {
     fn truncate_path(&self, limit: usize) -> String {
         let mut name_tokens: Vec<&str> = Vec::new();
         let mut rest_tokens: std::collections::VecDeque<&str> = self.split('/').collect();
@@ -336,7 +445,7 @@ struct Progress {
     total: u64,
 }
 
-struct ProgressRead<R, F>
+struct ProgressReader<R, F>
 where
     R: io::Read,
     F: Fn(Progress),
@@ -346,7 +455,7 @@ where
     progress: Progress,
 }
 
-impl<R, F> ProgressRead<R, F>
+impl<R, F> ProgressReader<R, F>
 where
     R: io::Read,
     F: Fn(Progress),
@@ -360,7 +469,7 @@ where
     }
 }
 
-impl<R, F> io::Read for ProgressRead<R, F>
+impl<R, F> io::Read for ProgressReader<R, F>
 where
     R: io::Read,
     F: Fn(Progress),
@@ -375,32 +484,12 @@ where
     }
 }
 
-/// Sanitize zip entry names: skip entries with traversed/absolute path to
-/// mitigate zip slip, and strip absolute prefix on entries pointing to root
-/// path.
-fn sanitize_entry_name(name: &str) -> Option<&str> {
-    // refuse entries with traversed/absolute path to mitigate zip slip
-    if name.contains("..") {
-        return None;
-    }
-
-    #[cfg(windows)]
-    {
-        if name.contains(":\\") || name.starts_with("\\") {
-            return None;
-        }
-        Some(name)
-    }
-
-    #[cfg(not(windows))]
-    {
-        // strip absolute prefix on entries pointing to root path
-        let mut entry_chars = name.chars();
-        let mut name = name;
-        while name.starts_with('/') {
-            entry_chars.next();
-            name = entry_chars.as_str()
-        }
-        Some(name)
+impl<R, F> ProgressReader<R, F>
+where
+    R: io::Read,
+    F: Fn(Progress),
+{
+    fn into_inner(self) -> R {
+        self.inner
     }
 }
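The streaming arm above has to move `entry_reader` into `ProgressReader`, then take it back out (via the new `into_inner`) before it can call `finish()` and advance to the next entry. Here is a self-contained sketch of that wrap-then-recover pattern, with a simplified stand-in for the example's `ProgressReader` (the callback shape and names here are my own, not the crate's API):

```rust
use std::io::{self, Read};

/// Stand-in progress wrapper, modeled on jean.rs's ProgressReader.
struct ProgressReader<R, F: Fn(u64)> {
    inner: R,
    done: u64,
    cb: F,
}

impl<R: Read, F: Fn(u64)> ProgressReader<R, F> {
    fn new(inner: R, cb: F) -> Self {
        Self { inner, done: 0, cb }
    }

    /// Hand the wrapped reader back so the caller can keep using it
    /// (e.g. to call `finish()` on a streaming zip entry reader).
    fn into_inner(self) -> R {
        self.inner
    }
}

impl<R: Read, F: Fn(u64)> Read for ProgressReader<R, F> {
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        let n = self.inner.read(buf)?;
        self.done += n as u64;
        (self.cb)(self.done);
        Ok(n)
    }
}

fn main() -> io::Result<()> {
    let data = b"hello world".as_slice();
    let mut reader = ProgressReader::new(data, |done| eprintln!("{done} bytes so far"));
    io::copy(&mut reader, &mut io::sink())?;
    let _original = reader.into_inner(); // recovered, as the example does before finish()
    Ok(())
}
```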
diff --git a/rc-zip-sync/src/entry_reader.rs b/rc-zip-sync/src/entry_reader.rs
index a48a6df..89b25c5 100644
--- a/rc-zip-sync/src/entry_reader.rs
+++ b/rc-zip-sync/src/entry_reader.rs
@@ -1,8 +1,9 @@
 use rc_zip::{
     fsm::{EntryFsm, FsmResult},
-    parse::StoredEntry,
+    parse::Entry,
 };
 use std::io;
+use tracing::trace;
 
 pub(crate) struct EntryReader<R>
 where
     R: io::Read,
@@ -16,10 +17,10 @@ impl<R> EntryReader<R>
 where
     R: io::Read,
 {
-    pub(crate) fn new(entry: &StoredEntry, rd: R) -> Self {
+    pub(crate) fn new(entry: &Entry, rd: R) -> Self {
         Self {
             rd,
-            fsm: Some(EntryFsm::new(entry.method(), entry.inner)),
+            fsm: Some(EntryFsm::new(Some(entry.clone()), None)),
         }
     }
 }
@@ -35,26 +36,31 @@ where
         };
 
         if fsm.wants_read() {
-            tracing::trace!("fsm wants read");
+            trace!("fsm wants read");
            let n = self.rd.read(fsm.space())?;
-            tracing::trace!("giving fsm {} bytes", n);
+            trace!("giving fsm {} bytes", n);
             fsm.fill(n);
         } else {
-            tracing::trace!("fsm does not want read");
+            trace!("fsm does not want read");
         }
 
         match fsm.process(buf)? {
             FsmResult::Continue((fsm, outcome)) => {
                 self.fsm = Some(fsm);
+
                 if outcome.bytes_written > 0 {
                     Ok(outcome.bytes_written)
+                } else if outcome.bytes_read == 0 {
+                    // that's EOF, baby!
+                    Ok(0)
                 } else {
                     // loop, it happens
                     self.read(buf)
                 }
             }
-            FsmResult::Done(()) => {
+            FsmResult::Done(_) => {
                 // neat!
+                trace!("fsm done");
                 Ok(0)
             }
         }
diff --git a/rc-zip-sync/src/lib.rs b/rc-zip-sync/src/lib.rs
index 304a1dd..d1c890a 100644
--- a/rc-zip-sync/src/lib.rs
+++ b/rc-zip-sync/src/lib.rs
@@ -10,6 +10,11 @@
 mod entry_reader;
 mod read_zip;
 
+mod streaming_entry_reader;
+pub use streaming_entry_reader::StreamingEntryReader;
+
 // re-exports
 pub use rc_zip;
-pub use read_zip::{HasCursor, ReadZip, ReadZipWithSize, SyncArchive, SyncStoredEntry};
+pub use read_zip::{
+    ArchiveHandle, EntryHandle, HasCursor, ReadZip, ReadZipStreaming, ReadZipWithSize,
+};
diff --git a/rc-zip-sync/src/read_zip.rs b/rc-zip-sync/src/read_zip.rs
index 3089090..a9679d5 100644
--- a/rc-zip-sync/src/read_zip.rs
+++ b/rc-zip-sync/src/read_zip.rs
@@ -1,10 +1,13 @@
 use rc_zip::{
     error::Error,
     fsm::{ArchiveFsm, FsmResult},
-    parse::{Archive, StoredEntry},
+    parse::Archive,
 };
+use rc_zip::{fsm::EntryFsm, parse::Entry};
+use tracing::trace;
 
 use crate::entry_reader::EntryReader;
+use crate::streaming_entry_reader::StreamingEntryReader;
 use std::{io::Read, ops::Deref};
 
 /// A trait for reading something as a zip archive
@@ -15,7 +18,7 @@ pub trait ReadZipWithSize {
     type File: HasCursor;
 
     /// Reads self as a zip archive.
-    fn read_zip_with_size(&self, size: u64) -> Result<SyncArchive<'_, Self::File>, Error>;
+    fn read_zip_with_size(&self, size: u64) -> Result<ArchiveHandle<'_, Self::File>, Error>;
 }
 
 /// A trait for reading something as a zip archive when we can tell size from
@@ -27,7 +30,7 @@ pub trait ReadZip {
     type File: HasCursor;
 
     /// Reads self as a zip archive.
-    fn read_zip(&self) -> Result<SyncArchive<'_, Self::File>, Error>;
+    fn read_zip(&self) -> Result<ArchiveHandle<'_, Self::File>, Error>;
 }
 
 impl<F> ReadZipWithSize for F
@@ -36,15 +39,15 @@ where
 {
     type File = F;
 
-    fn read_zip_with_size(&self, size: u64) -> Result<SyncArchive<'_, F>, Error> {
-        tracing::trace!(%size, "read_zip_with_size");
+    fn read_zip_with_size(&self, size: u64) -> Result<ArchiveHandle<'_, F>, Error> {
+        trace!(%size, "read_zip_with_size");
         let mut fsm = ArchiveFsm::new(size);
         loop {
             if let Some(offset) = fsm.wants_read() {
-                tracing::trace!(%offset, "read_zip_with_size: wants_read, space len = {}", fsm.space().len());
+                trace!(%offset, "read_zip_with_size: wants_read, space len = {}", fsm.space().len());
                 match self.cursor_at(offset).read(fsm.space()) {
                     Ok(read_bytes) => {
-                        tracing::trace!(%read_bytes, "read_zip_with_size: read");
+                        trace!(%read_bytes, "read_zip_with_size: read");
                         if read_bytes == 0 {
                             return Err(Error::IO(std::io::ErrorKind::UnexpectedEof.into()));
                         }
@@ -56,8 +59,8 @@ where
 
             fsm = match fsm.process()? {
                 FsmResult::Done(archive) => {
-                    tracing::trace!("read_zip_with_size: done");
-                    return Ok(SyncArchive {
+                    trace!("read_zip_with_size: done");
+                    return Ok(ArchiveHandle {
                         file: self,
                         archive,
                     });
@@ -71,7 +74,7 @@ where
 impl ReadZip for &[u8] {
     type File = Self;
 
-    fn read_zip(&self) -> Result<SyncArchive<'_, Self::File>, Error> {
+    fn read_zip(&self) -> Result<ArchiveHandle<'_, Self::File>, Error> {
         self.read_zip_with_size(self.len() as u64)
     }
 }
@@ -79,7 +82,7 @@ impl ReadZip for &[u8] {
 impl ReadZip for Vec<u8> {
     type File = Self;
 
-    fn read_zip(&self) -> Result<SyncArchive<'_, Self::File>, Error> {
+    fn read_zip(&self) -> Result<ArchiveHandle<'_, Self::File>, Error> {
         self.read_zip_with_size(self.len() as u64)
     }
 }
@@ -88,8 +91,8 @@ impl ReadZip for Vec<u8> {
 ///
 /// This only contains metadata for the archive and its entries. Separate
 /// readers can be created for arbitrary entries on-demand using
-/// [SyncStoredEntry::reader].
-pub struct SyncArchive<'a, F>
+/// [EntryHandle::reader].
+pub struct ArchiveHandle<'a, F>
 where
     F: HasCursor,
 {
@@ -97,7 +100,7 @@ where
     archive: Archive,
 }
 
-impl<F> Deref for SyncArchive<'_, F>
+impl<F> Deref for ArchiveHandle<'_, F>
 where
     F: HasCursor,
 {
@@ -108,13 +111,13 @@ where
     }
 }
 
-impl<F> SyncArchive<'_, F>
+impl<F> ArchiveHandle<'_, F>
 where
     F: HasCursor,
 {
     /// Iterate over all files in this zip, read from the central directory.
-    pub fn entries(&self) -> impl Iterator<Item = SyncStoredEntry<'_, F>> {
-        self.archive.entries().map(move |entry| SyncStoredEntry {
+    pub fn entries(&self) -> impl Iterator<Item = EntryHandle<'_, F>> {
+        self.archive.entries().map(move |entry| EntryHandle {
             file: self.file,
             entry,
         })
@@ -122,11 +125,11 @@ where
 
     /// Attempts to look up an entry by name. This is usually a bad idea,
     /// as names aren't necessarily normalized in zip archives.
-    pub fn by_name<N: AsRef<str>>(&self, name: N) -> Option<SyncStoredEntry<'_, F>> {
+    pub fn by_name<N: AsRef<str>>(&self, name: N) -> Option<EntryHandle<'_, F>> {
         self.archive
             .entries()
-            .find(|&x| x.name() == name.as_ref())
-            .map(|entry| SyncStoredEntry {
+            .find(|&x| x.name == name.as_ref())
+            .map(|entry| EntryHandle {
                 file: self.file,
                 entry,
             })
     }
 }
 
 /// A zip entry, read synchronously from a file or other I/O resource.
-pub struct SyncStoredEntry<'a, F> {
+pub struct EntryHandle<'a, F> {
     file: &'a F,
-    entry: &'a StoredEntry,
+    entry: &'a Entry,
 }
 
-impl<F> Deref for SyncStoredEntry<'_, F> {
-    type Target = StoredEntry;
+impl<F> Deref for EntryHandle<'_, F> {
+    type Target = Entry;
 
     fn deref(&self) -> &Self::Target {
         self.entry
     }
 }
 
-impl<'a, F> SyncStoredEntry<'a, F>
+impl<'a, F> EntryHandle<'a, F>
 where
     F: HasCursor,
 {
@@ -210,8 +213,50 @@ impl HasCursor for std::fs::File {
 impl ReadZip for std::fs::File {
     type File = Self;
 
-    fn read_zip(&self) -> Result<SyncArchive<'_, Self::File>, Error> {
+    fn read_zip(&self) -> Result<ArchiveHandle<'_, Self::File>, Error> {
         let size = self.metadata()?.len();
         self.read_zip_with_size(size)
     }
 }
+
+/// Allows reading zip entries in a streaming fashion, without seeking,
+/// based only on local headers. THIS IS NOT RECOMMENDED, as correctly
+/// reading zip files requires reading the central directory (located at
+/// the end of the file).
+pub trait ReadZipStreaming<R>
+where
+    R: Read,
+{
+    /// Get the first zip entry from the stream as a [StreamingEntryReader].
+    ///
+    /// See the trait's documentation for why using this is
+    /// generally a bad idea: you might want to use [ReadZip] or
+    /// [ReadZipWithSize] instead.
+    fn stream_zip_entries_throwing_caution_to_the_wind(
+        self,
+    ) -> Result<StreamingEntryReader<R>, Error>;
+}
+
+impl<R> ReadZipStreaming<R> for R
+where
+    R: Read,
+{
+    fn stream_zip_entries_throwing_caution_to_the_wind(
+        mut self,
+    ) -> Result<StreamingEntryReader<R>, Error> {
+        let mut fsm = EntryFsm::new(None, None);
+
+        loop {
+            if fsm.wants_read() {
+                let n = self.read(fsm.space())?;
+                trace!("read {} bytes into buf for first zip entry", n);
+                fsm.fill(n);
+            }
+
+            if let Some(entry) = fsm.process_till_header()? {
+                let entry = entry.clone();
+                return Ok(StreamingEntryReader::new(fsm, entry, self));
+            }
+        }
+    }
+}
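Taken together, the renamed `ReadZip` traits and the new streaming trait support two reading styles. A hypothetical sketch of both, using only APIs visible in this patch (where the zip bytes come from is assumed):

```rust
use rc_zip_sync::{ReadZip, ReadZipStreaming};
use std::io::Read;

fn list_both_ways(zip_bytes: Vec<u8>) -> Result<(), Box<dyn std::error::Error>> {
    // Seekable path: parses the central directory at the end of the file,
    // which is the authoritative list of entries.
    let archive = zip_bytes.read_zip()?;
    for entry in archive.entries() {
        println!("{} ({} bytes)", entry.name, entry.uncompressed_size);
    }

    // Streaming path: trusts local headers only; each entry must be read
    // to the end before `finish()` can hand over the next one.
    let mut entry = zip_bytes
        .as_slice()
        .stream_zip_entries_throwing_caution_to_the_wind()?;
    loop {
        let mut contents = Vec::new();
        entry.read_to_end(&mut contents)?;
        println!("streamed {}", entry.entry().name);
        match entry.finish()? {
            Some(next) => entry = next,
            None => break,
        }
    }
    Ok(())
}
```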
diff --git a/rc-zip-sync/src/streaming_entry_reader.rs b/rc-zip-sync/src/streaming_entry_reader.rs
new file mode 100644
index 0000000..325731a
--- /dev/null
+++ b/rc-zip-sync/src/streaming_entry_reader.rs
@@ -0,0 +1,161 @@
+use oval::Buffer;
+use rc_zip::{
+    error::{Error, FormatError},
+    fsm::{EntryFsm, FsmResult},
+    parse::Entry,
+};
+use std::io::{self, Read};
+use tracing::trace;
+
+/// Reads a zip entry based on a local header. Some information is missing,
+/// not all name encodings may work, and only by reading it in its entirety
+/// can you move on to the next entry.
+///
+/// However, it only requires an [io::Read], and does not need to seek.
+pub struct StreamingEntryReader<R> {
+    entry: Entry,
+    rd: R,
+    state: State,
+}
+
+#[derive(Default)]
+#[allow(clippy::large_enum_variant)]
+enum State {
+    Reading {
+        fsm: EntryFsm,
+    },
+    Finished {
+        /// remaining buffer for next entry
+        remain: Buffer,
+    },
+    #[default]
+    Transition,
+}
+
+impl<R> StreamingEntryReader<R>
+where
+    R: io::Read,
+{
+    pub(crate) fn new(fsm: EntryFsm, entry: Entry, rd: R) -> Self {
+        Self {
+            entry,
+            rd,
+            state: State::Reading { fsm },
+        }
+    }
+}
+
+impl<R> io::Read for StreamingEntryReader<R>
+where
+    R: io::Read,
+{
+    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
+        trace!("reading from streaming entry reader");
+
+        match std::mem::take(&mut self.state) {
+            State::Reading { mut fsm } => {
+                if fsm.wants_read() {
+                    trace!("fsm wants read");
+                    let n = self.rd.read(fsm.space())?;
+                    trace!("giving fsm {} bytes from rd", n);
+                    fsm.fill(n);
+                } else {
+                    trace!("fsm does not want read");
+                }
+
+                match fsm.process(buf)? {
+                    FsmResult::Continue((fsm, outcome)) => {
+                        trace!("fsm wants to continue");
+                        self.state = State::Reading { fsm };
+
+                        if outcome.bytes_written > 0 {
+                            trace!("bytes have been written");
+                            Ok(outcome.bytes_written)
+                        } else if outcome.bytes_read == 0 {
+                            trace!("no bytes have been written or read");
+                            // that's EOF, baby!
+                            Ok(0)
+                        } else {
+                            trace!("read some bytes, hopefully will write more later");
+                            // loop, it happens
+                            self.read(buf)
+                        }
+                    }
+                    FsmResult::Done(remain) => {
+                        self.state = State::Finished { remain };
+
+                        // neat!
+                        Ok(0)
+                    }
+                }
+            }
+            State::Finished { remain } => {
+                // wait for them to call finish
+                self.state = State::Finished { remain };
+                Ok(0)
+            }
+            State::Transition => unreachable!(),
+        }
+    }
+}
+
+impl<R> StreamingEntryReader<R>
+where
+    R: io::Read,
+{
+    /// Return entry information for this reader
+    #[inline(always)]
+    pub fn entry(&self) -> &Entry {
+        &self.entry
+    }
+
+    /// Finish reading this entry, returning the next streaming entry reader, if
+    /// any. This panics if the entry is not fully read.
+    ///
+    /// If this returns None, there are no entries left.
+    pub fn finish(mut self) -> Result<Option<StreamingEntryReader<R>>, Error> {
+        trace!("finishing streaming entry reader");
+
+        if matches!(self.state, State::Reading { .. }) {
+            // this should transition to finished if there's no data
+            _ = self.read(&mut [0u8; 1])?;
+        }
+
+        match self.state {
+            State::Reading { .. } => {
+                panic!("entry not fully read");
+            }
+            State::Finished { remain } => {
+                // parse the next entry, if any
+                let mut fsm = EntryFsm::new(None, Some(remain));
+
+                loop {
+                    if fsm.wants_read() {
+                        let n = self.rd.read(fsm.space())?;
+                        trace!("read {} bytes into buf for next zip entry", n);
+                        fsm.fill(n);
+                    }
+
+                    match fsm.process_till_header() {
+                        Ok(Some(entry)) => {
+                            let entry = entry.clone();
+                            return Ok(Some(StreamingEntryReader::new(fsm, entry, self.rd)));
+                        }
+                        Ok(None) => {
+                            // needs more turns
+                        }
+                        Err(e) => match e {
+                            Error::Format(FormatError::InvalidLocalHeader) => {
+                                // we probably reached the end of central directory!
+                                // TODO: we should probably check for the end of central directory
+                                return Ok(None);
+                            }
+                            _ => return Err(e),
+                        },
+                    }
+                }
+            }
+            State::Transition => unreachable!(),
+        }
+    }
+}
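`State` defaults to `Transition` purely so that `std::mem::take` can pull the owned `EntryFsm` (or leftover `Buffer`) out from behind `&mut self`; every branch must then store a real state back. The same trick in isolation, as a minimal sketch unrelated to zip parsing:

```rust
#[derive(Default)]
enum State {
    Counting { count: u32 },
    Finished { total: u32 },
    // Placeholder left behind by `mem::take`; never observed
    // unless a transition panics halfway through.
    #[default]
    Transition,
}

struct Machine {
    state: State,
}

impl Machine {
    fn step(&mut self) {
        // Take ownership of the current state, leaving `Transition` behind.
        match std::mem::take(&mut self.state) {
            State::Counting { count } if count >= 3 => {
                self.state = State::Finished { total: count };
            }
            State::Counting { count } => {
                self.state = State::Counting { count: count + 1 };
            }
            // A finished machine stays finished: put the state back.
            State::Finished { total } => self.state = State::Finished { total },
            State::Transition => unreachable!("state is always restored"),
        }
    }
}

fn main() {
    let mut m = Machine { state: State::Counting { count: 0 } };
    for _ in 0..5 {
        m.step();
    }
    assert!(matches!(m.state, State::Finished { total: 3 }));
}
```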
diff --git a/rc-zip-sync/tests/integration_tests.rs b/rc-zip-sync/tests/integration_tests.rs
index a459318..3df5500 100644
--- a/rc-zip-sync/tests/integration_tests.rs
+++ b/rc-zip-sync/tests/integration_tests.rs
@@ -1,25 +1,29 @@
 use rc_zip::{
-    corpus::{self, zips_dir, Case},
+    corpus::{self, zips_dir, Case, Files},
     error::Error,
     parse::Archive,
 };
-use rc_zip_sync::{HasCursor, ReadZip, SyncArchive};
+use rc_zip_sync::{ArchiveHandle, HasCursor, ReadZip, ReadZipStreaming};
 
-use std::fs::File;
+use std::{fs::File, io::Read};
 
-fn check_case<F: HasCursor>(test: &Case, archive: Result<SyncArchive<'_, F>, Error>) {
+fn check_case<F: HasCursor>(test: &Case, archive: Result<ArchiveHandle<'_, F>, Error>) {
     corpus::check_case(test, archive.as_ref().map(|ar| -> &Archive { ar }));
     let archive = match archive {
         Ok(archive) => archive,
         Err(_) => return,
     };
 
-    for file in &test.files {
-        let entry = archive
-            .by_name(file.name)
-            .unwrap_or_else(|| panic!("entry {} should exist", file.name));
+    if let Files::ExhaustiveList(files) = &test.files {
+        for file in files {
+            tracing::info!("checking file {}", file.name);
+            let entry = archive
+                .by_name(file.name)
+                .unwrap_or_else(|| panic!("entry {} should exist", file.name));
 
-        corpus::check_file_against(file, &entry, &entry.bytes().unwrap()[..])
+            tracing::info!("got entry for {}", file.name);
+            corpus::check_file_against(file, &entry, &entry.bytes().unwrap()[..])
+        }
     }
 }
 
@@ -43,9 +47,34 @@ fn real_world_files() {
     for case in corpus::test_cases() {
         tracing::info!("============ testing {}", case.name);
 
-        let file = File::open(case.absolute_path()).unwrap();
+        let guarded_path = case.absolute_path();
+        let file = File::open(&guarded_path.path).unwrap();
         let archive = file.read_zip().map_err(Error::from);
+        check_case(&case, archive);
+        drop(guarded_path)
+    }
+}
+
+#[test_log::test]
+fn streaming() {
+    for case in corpus::streaming_test_cases() {
+        let guarded_path = case.absolute_path();
+        let file = File::open(&guarded_path.path).unwrap();
+
+        let mut entry = file
+            .stream_zip_entries_throwing_caution_to_the_wind()
+            .unwrap();
+        loop {
+            let mut v = vec![];
+            let n = entry.read_to_end(&mut v).unwrap();
+            tracing::trace!("entry {} read {} bytes", entry.entry().name, n);
+
+            match entry.finish().unwrap() {
+                Some(next) => entry = next,
+                None => break,
+            }
+        }
 
-        check_case(&case, archive)
+        drop(guarded_path)
     }
 }
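The `Files` enum these tests now match on (defined in the corpus diff below) is what lets a case assert a bare entry count, as the 11372-entry `wine-zeroed.zip.bz2` fixture does, instead of an exhaustive listing. Declaring such a case looks like this; the fixture name is hypothetical, and `corpus` is behind the crate's `corpus` feature:

```rust
use rc_zip::corpus::{Case, Files};

// Hypothetical fixture: we only care that every entry parses,
// not what each one contains.
fn big_archive_case() -> Case {
    Case {
        name: "big-but-boring.zip",
        files: Files::NumFiles(5000),
        ..Default::default()
    }
}
```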
diff --git a/rc-zip-tokio/src/entry_reader.rs b/rc-zip-tokio/src/entry_reader.rs
index c4af59b..094da2a 100644
--- a/rc-zip-tokio/src/entry_reader.rs
+++ b/rc-zip-tokio/src/entry_reader.rs
@@ -3,7 +3,7 @@ use std::{pin::Pin, task};
 use pin_project_lite::pin_project;
 use rc_zip::{
     fsm::{EntryFsm, FsmResult},
-    parse::StoredEntry,
+    parse::Entry,
 };
 use tokio::io::{AsyncRead, ReadBuf};
 
@@ -22,13 +22,13 @@ impl<R> EntryReader<R>
 where
     R: AsyncRead,
 {
-    pub(crate) fn new<F>(entry: &StoredEntry, get_reader: F) -> Self
+    pub(crate) fn new<F>(entry: &Entry, get_reader: F) -> Self
     where
         F: Fn(u64) -> R,
     {
         Self {
             rd: get_reader(entry.header_offset),
-            fsm: Some(EntryFsm::new(entry.method(), entry.inner)),
+            fsm: Some(EntryFsm::new(Some(entry.clone()), None)),
         }
     }
 }
@@ -73,12 +73,14 @@ where
                     if outcome.bytes_written > 0 {
                         tracing::trace!("wrote {} bytes", outcome.bytes_written);
                         buf.advance(outcome.bytes_written);
+                    } else if outcome.bytes_read == 0 {
+                        // that's EOF, baby!
                     } else {
                         // loop, it happens
                         return self.poll_read(cx, buf);
                     }
                 }
-                FsmResult::Done(()) => {
+                FsmResult::Done(_) => {
                     // neat!
                 }
             }
diff --git a/rc-zip-tokio/src/lib.rs b/rc-zip-tokio/src/lib.rs
index 8666c73..aed77c9 100644
--- a/rc-zip-tokio/src/lib.rs
+++ b/rc-zip-tokio/src/lib.rs
@@ -7,11 +7,14 @@
 
 #![warn(missing_docs)]
 
-mod async_read_zip;
 mod entry_reader;
+mod read_zip;
+
+mod streaming_entry_reader;
+pub use streaming_entry_reader::StreamingEntryReader;
 
 // re-exports
-pub use async_read_zip::{
-    AsyncArchive, AsyncStoredEntry, HasAsyncCursor, ReadZipAsync, ReadZipWithSizeAsync,
-};
 pub use rc_zip;
+pub use read_zip::{
+    ArchiveHandle, EntryHandle, HasCursor, ReadZip, ReadZipStreaming, ReadZipWithSize,
+};
diff --git a/rc-zip-tokio/src/async_read_zip.rs b/rc-zip-tokio/src/read_zip.rs
similarity index 63%
rename from rc-zip-tokio/src/async_read_zip.rs
rename to rc-zip-tokio/src/read_zip.rs
index bc68be0..f64dae6 100644
--- a/rc-zip-tokio/src/async_read_zip.rs
+++ b/rc-zip-tokio/src/read_zip.rs
@@ -1,4 +1,4 @@
-use std::{io, ops::Deref, pin::Pin, sync::Arc, task};
+use std::{cmp, io, ops::Deref, pin::Pin, sync::Arc, task};
 
 use futures::future::BoxFuture;
 use positioned_io::{RandomAccessFile, ReadAt, Size};
@@ -6,48 +6,46 @@ use tokio::io::{AsyncRead, AsyncReadExt, ReadBuf};
 
 use rc_zip::{
     error::Error,
-    fsm::{ArchiveFsm, FsmResult},
-    parse::{Archive, StoredEntry},
+    fsm::{ArchiveFsm, EntryFsm, FsmResult},
+    parse::{Archive, Entry},
 };
+use tracing::trace;
 
-use crate::entry_reader::EntryReader;
+use crate::{entry_reader::EntryReader, StreamingEntryReader};
 
 /// A trait for reading something as a zip archive.
 ///
-/// See also [ReadZipAsync].
-pub trait ReadZipWithSizeAsync {
+/// See also [ReadZip].
+pub trait ReadZipWithSize {
     /// The type of the file to read from.
-    type File: HasAsyncCursor;
+    type File: HasCursor;
 
     /// Reads self as a zip archive.
     #[allow(async_fn_in_trait)]
-    async fn read_zip_with_size_async(
-        &self,
-        size: u64,
-    ) -> Result<AsyncArchive<'_, Self::File>, Error>;
+    async fn read_zip_with_size(&self, size: u64) -> Result<ArchiveHandle<'_, Self::File>, Error>;
 }
 
 /// A zip archive, read asynchronously from a file or other I/O resource.
 ///
 /// This only contains metadata for the archive and its entries. Separate
 /// readers can be created for arbitrary entries on-demand using
-/// [AsyncStoredEntry::reader].
-pub trait ReadZipAsync {
+/// [EntryHandle::reader].
+pub trait ReadZip {
     /// The type of the file to read from.
-    type File: HasAsyncCursor;
+    type File: HasCursor;
 
     /// Reads self as a zip archive.
     #[allow(async_fn_in_trait)]
-    async fn read_zip_async(&self) -> Result<AsyncArchive<'_, Self::File>, Error>;
+    async fn read_zip(&self) -> Result<ArchiveHandle<'_, Self::File>, Error>;
 }
 
-impl<F> ReadZipWithSizeAsync for F
+impl<F> ReadZipWithSize for F
 where
-    F: HasAsyncCursor,
+    F: HasCursor,
 {
     type File = F;
 
-    async fn read_zip_with_size_async(&self, size: u64) -> Result<AsyncArchive<'_, F>, Error> {
+    async fn read_zip_with_size(&self, size: u64) -> Result<ArchiveHandle<'_, F>, Error> {
         let mut fsm = ArchiveFsm::new(size);
         loop {
             if let Some(offset) = fsm.wants_read() {
@@ -64,7 +62,7 @@ where
 
             fsm = match fsm.process()? {
                 FsmResult::Done(archive) => {
-                    return Ok(AsyncArchive {
+                    return Ok(ArchiveHandle {
                         file: self,
                         archive,
                     })
@@ -75,43 +73,43 @@ where
     }
 }
 
-impl ReadZipAsync for &[u8] {
+impl ReadZip for &[u8] {
     type File = Self;
 
-    async fn read_zip_async(&self) -> Result<AsyncArchive<'_, Self::File>, Error> {
-        self.read_zip_with_size_async(self.len() as u64).await
+    async fn read_zip(&self) -> Result<ArchiveHandle<'_, Self::File>, Error> {
+        self.read_zip_with_size(self.len() as u64).await
     }
 }
 
-impl ReadZipAsync for Vec<u8> {
+impl ReadZip for Vec<u8> {
     type File = Self;
 
-    async fn read_zip_async(&self) -> Result<AsyncArchive<'_, Self::File>, Error> {
-        self.read_zip_with_size_async(self.len() as u64).await
+    async fn read_zip(&self) -> Result<ArchiveHandle<'_, Self::File>, Error> {
+        self.read_zip_with_size(self.len() as u64).await
     }
 }
 
-impl ReadZipAsync for Arc<RandomAccessFile> {
+impl ReadZip for Arc<RandomAccessFile> {
     type File = Self;
 
-    async fn read_zip_async(&self) -> Result<AsyncArchive<'_, Self::File>, Error> {
+    async fn read_zip(&self) -> Result<ArchiveHandle<'_, Self::File>, Error> {
         let size = self.size()?.unwrap_or_default();
-        self.read_zip_with_size_async(size).await
+        self.read_zip_with_size(size).await
     }
 }
 
 /// A zip archive, read asynchronously from a file or other I/O resource.
-pub struct AsyncArchive<'a, F>
+pub struct ArchiveHandle<'a, F>
 where
-    F: HasAsyncCursor,
+    F: HasCursor,
 {
     file: &'a F,
     archive: Archive,
 }
 
-impl<F> Deref for AsyncArchive<'_, F>
+impl<F> Deref for ArchiveHandle<'_, F>
 where
-    F: HasAsyncCursor,
+    F: HasCursor,
 {
     type Target = Archive;
 
@@ -120,13 +118,13 @@ where
     }
 }
 
-impl<F> AsyncArchive<'_, F>
+impl<F> ArchiveHandle<'_, F>
 where
-    F: HasAsyncCursor,
+    F: HasCursor,
 {
     /// Iterate over all files in this zip, read from the central directory.
-    pub fn entries(&self) -> impl Iterator<Item = AsyncStoredEntry<'_, F>> {
-        self.archive.entries().map(move |entry| AsyncStoredEntry {
+    pub fn entries(&self) -> impl Iterator<Item = EntryHandle<'_, F>> {
+        self.archive.entries().map(move |entry| EntryHandle {
             file: self.file,
             entry,
         })
@@ -134,11 +132,11 @@ where
 
     /// Attempts to look up an entry by name. This is usually a bad idea,
     /// as names aren't necessarily normalized in zip archives.
-    pub fn by_name<N: AsRef<str>>(&self, name: N) -> Option<AsyncStoredEntry<'_, F>> {
+    pub fn by_name<N: AsRef<str>>(&self, name: N) -> Option<EntryHandle<'_, F>> {
         self.archive
             .entries()
-            .find(|&x| x.name() == name.as_ref())
-            .map(|entry| AsyncStoredEntry {
+            .find(|&x| x.name == name.as_ref())
+            .map(|entry| EntryHandle {
                 file: self.file,
                 entry,
             })
     }
 }
 
 /// A single entry in a zip archive, read asynchronously from a file or other I/O resource.
-pub struct AsyncStoredEntry<'a, F> {
+pub struct EntryHandle<'a, F> {
     file: &'a F,
-    entry: &'a StoredEntry,
+    entry: &'a Entry,
 }
 
-impl<F> Deref for AsyncStoredEntry<'_, F> {
-    type Target = StoredEntry;
+impl<F> Deref for EntryHandle<'_, F> {
+    type Target = Entry;
 
     fn deref(&self) -> &Self::Target {
         self.entry
     }
 }
 
-impl<'a, F> AsyncStoredEntry<'a, F>
+impl<'a, F> EntryHandle<'a, F>
 where
-    F: HasAsyncCursor,
+    F: HasCursor,
 {
     /// Returns a reader for the entry.
     pub fn reader(&self) -> impl AsyncRead + Unpin + '_ {
@@ -177,8 +175,8 @@ where
 }
 
 /// A sliceable I/O resource: we can ask for an [AsyncRead] at a given offset.
-pub trait HasAsyncCursor {
-    /// The type returned by [HasAsyncCursor::cursor_at].
+pub trait HasCursor {
+    /// The type returned by [HasCursor::cursor_at].
     type Cursor<'a>: AsyncRead + Unpin + 'a
     where
         Self: 'a;
 
@@ -187,7 +185,7 @@ pub trait HasAsyncCursor {
     fn cursor_at(&self, offset: u64) -> Self::Cursor<'_>;
 }
 
-impl HasAsyncCursor for &[u8] {
+impl HasCursor for &[u8] {
     type Cursor<'a> = &'a [u8]
     where
         Self: 'a;
@@ -197,7 +195,7 @@ impl HasAsyncCursor for &[u8] {
     }
 }
 
-impl HasAsyncCursor for Vec<u8> {
+impl HasCursor for Vec<u8> {
     type Cursor<'a> = &'a [u8]
     where
         Self: 'a;
@@ -207,7 +205,7 @@ impl HasAsyncCursor for Vec<u8> {
     }
 }
 
-impl HasAsyncCursor for Arc<RandomAccessFile> {
+impl HasCursor for Arc<RandomAccessFile> {
     type Cursor<'a> = AsyncRandomAccessFileCursor
     where
         Self: 'a;
@@ -259,9 +257,10 @@ impl AsyncRead for AsyncRandomAccessFileCursor {
                     ARAFCState::Idle(core) => core,
                     _ => unreachable!(),
                 };
+                let read_len = cmp::min(buf.remaining(), core.inner_buf.len());
                 let pos = self.pos;
                 let fut = Box::pin(tokio::task::spawn_blocking(move || {
-                    let read = core.file.read_at(pos, &mut core.inner_buf);
+                    let read = core.file.read_at(pos, &mut core.inner_buf[..read_len]);
                     (read, core)
                 }));
                 self.state = ARAFCState::Reading { fut };
@@ -292,3 +291,46 @@ impl AsyncRead for AsyncRandomAccessFileCursor {
         }
     }
 }
+
+/// Allows reading zip entries in a streaming fashion, without seeking,
+/// based only on local headers. THIS IS NOT RECOMMENDED, as correctly
+/// reading zip files requires reading the central directory (located at
+/// the end of the file).
+pub trait ReadZipStreaming<R>
+where
+    R: AsyncRead,
+{
+    /// Get the first zip entry from the stream as a [StreamingEntryReader].
+    ///
+    /// See the trait's documentation for why using this is
+    /// generally a bad idea: you might want to use [ReadZip] or
+    /// [ReadZipWithSize] instead.
+    #[allow(async_fn_in_trait)]
+    async fn stream_zip_entries_throwing_caution_to_the_wind(
+        self,
+    ) -> Result<StreamingEntryReader<R>, Error>;
+}
+
+impl<R> ReadZipStreaming<R> for R
+where
+    R: AsyncRead + Unpin,
+{
+    async fn stream_zip_entries_throwing_caution_to_the_wind(
+        mut self,
+    ) -> Result<StreamingEntryReader<R>, Error> {
+        let mut fsm = EntryFsm::new(None, None);
+
+        loop {
+            if fsm.wants_read() {
+                let n = self.read(fsm.space()).await?;
+                trace!("read {} bytes into buf for first zip entry", n);
+                fsm.fill(n);
+            }
+
+            if let Some(entry) = fsm.process_till_header()? {
+                let entry = entry.clone();
+                return Ok(StreamingEntryReader::new(fsm, entry, self));
+            }
+        }
+    }
+}
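Usage of the renamed async API mirrors the sync crate with `.await` added. A hypothetical sketch (the fixture path is made up, and tokio's `macros`/`rt` features are assumed):

```rust
use positioned_io::RandomAccessFile;
use rc_zip_tokio::ReadZip;
use std::sync::Arc;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Arc<RandomAccessFile> implements HasCursor, so read_zip() can issue
    // positioned reads on blocking threads while callers stay async.
    let file = Arc::new(RandomAccessFile::open("fixture.zip")?); // assumed path
    let archive = file.read_zip().await?;
    for entry in archive.entries() {
        println!("{}", entry.name);
    }
    Ok(())
}
```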
diff --git a/rc-zip-tokio/src/streaming_entry_reader.rs b/rc-zip-tokio/src/streaming_entry_reader.rs
new file mode 100644
index 0000000..57ef240
--- /dev/null
+++ b/rc-zip-tokio/src/streaming_entry_reader.rs
@@ -0,0 +1,179 @@
+use oval::Buffer;
+use pin_project_lite::pin_project;
+use rc_zip::{
+    error::{Error, FormatError},
+    fsm::{EntryFsm, FsmResult},
+    parse::Entry,
+};
+use std::{io, pin::Pin, task};
+use tokio::io::{AsyncRead, AsyncReadExt, ReadBuf};
+use tracing::trace;
+
+pin_project! {
+    /// Reads a zip entry based on a local header. Some information is missing,
+    /// not all name encodings may work, and only by reading it in its entirety
+    /// can you move on to the next entry.
+    ///
+    /// However, it only requires an [AsyncRead], and does not need to seek.
+    pub struct StreamingEntryReader<R> {
+        entry: Entry,
+        #[pin]
+        rd: R,
+        state: State,
+    }
+}
+
+#[derive(Default)]
+#[allow(clippy::large_enum_variant)]
+enum State {
+    Reading {
+        fsm: EntryFsm,
+    },
+    Finished {
+        /// remaining buffer for next entry
+        remain: Buffer,
+    },
+    #[default]
+    Transition,
+}
+
+impl<R> StreamingEntryReader<R>
+where
+    R: AsyncRead,
+{
+    pub(crate) fn new(fsm: EntryFsm, entry: Entry, rd: R) -> Self {
+        Self {
+            entry,
+            rd,
+            state: State::Reading { fsm },
+        }
+    }
+}
+
+impl<R> AsyncRead for StreamingEntryReader<R>
+where
+    R: AsyncRead,
+{
+    fn poll_read(
+        mut self: Pin<&mut Self>,
+        cx: &mut task::Context<'_>,
+        buf: &mut ReadBuf<'_>,
+    ) -> task::Poll<io::Result<()>> {
+        let this = self.as_mut().project();
+
+        trace!("reading from streaming entry reader");
+
+        match std::mem::take(this.state) {
+            State::Reading { mut fsm } => {
+                if fsm.wants_read() {
+                    trace!("fsm wants read");
+                    let mut buf = ReadBuf::new(fsm.space());
+                    match this.rd.poll_read(cx, &mut buf) {
+                        task::Poll::Ready(res) => res?,
+                        task::Poll::Pending => {
+                            *this.state = State::Reading { fsm };
+                            return task::Poll::Pending;
+                        }
+                    }
+                    let n = buf.filled().len();
+
+                    trace!("giving fsm {} bytes from rd", n);
+                    fsm.fill(n);
+                } else {
+                    trace!("fsm does not want read");
+                }
+
+                match fsm.process(buf.initialize_unfilled())? {
+                    FsmResult::Continue((fsm, outcome)) => {
+                        trace!("fsm wants to continue");
+                        *this.state = State::Reading { fsm };
+
+                        if outcome.bytes_written > 0 {
+                            trace!("bytes have been written");
+                            buf.advance(outcome.bytes_written);
+                        } else if outcome.bytes_read == 0 {
+                            trace!("no bytes have been written or read");
+                            // that's EOF, baby!
+                        } else {
+                            trace!("read some bytes, hopefully will write more later");
+                            // loop, it happens
+                            return self.poll_read(cx, buf);
+                        }
+                    }
+                    FsmResult::Done(remain) => {
+                        *this.state = State::Finished { remain };
+
+                        // neat!
+                    }
+                }
+            }
+            State::Finished { remain } => {
+                // wait for them to call finish
+                *this.state = State::Finished { remain };
+            }
+            State::Transition => unreachable!(),
+        }
+        Ok(()).into()
+    }
+}
+
+impl<R> StreamingEntryReader<R>
+where
+    R: AsyncRead + Unpin,
+{
+    /// Return entry information for this reader
+    #[inline(always)]
+    pub fn entry(&self) -> &Entry {
+        &self.entry
+    }
+
+    /// Finish reading this entry, returning the next streaming entry reader, if
+    /// any. This panics if the entry is not fully read.
+    ///
+    /// If this returns None, there are no entries left.
+    pub async fn finish(mut self) -> Result<Option<StreamingEntryReader<R>>, Error> {
+        trace!("finishing streaming entry reader");
+
+        if matches!(self.state, State::Reading { .. }) {
+            // this should transition to finished if there's no data
+            _ = self.read(&mut [0u8; 1]).await?;
+        }
+
+        match self.state {
+            State::Reading { .. } => {
+                panic!("entry not fully read");
+            }
+            State::Finished { remain } => {
+                // parse the next entry, if any
+                let mut fsm = EntryFsm::new(None, Some(remain));
+
+                loop {
+                    if fsm.wants_read() {
+                        let n = self.rd.read(fsm.space()).await?;
+                        trace!("read {} bytes into buf for next zip entry", n);
+                        fsm.fill(n);
+                    }
+
+                    match fsm.process_till_header() {
+                        Ok(Some(entry)) => {
+                            let entry = entry.clone();
+                            return Ok(Some(StreamingEntryReader::new(fsm, entry, self.rd)));
+                        }
+                        Ok(None) => {
+                            // needs more turns
+                        }
+                        Err(e) => match e {
+                            Error::Format(FormatError::InvalidLocalHeader) => {
+                                // we probably reached the end of central directory!
+                                // TODO: we should probably check for the end of central directory
+                                return Ok(None);
+                            }
+                            _ => return Err(e),
+                        },
+                    }
+                }
+            }
+            State::Transition => unreachable!(),
+        }
+    }
+}
diff --git a/rc-zip-tokio/tests/integration_tests.rs b/rc-zip-tokio/tests/integration_tests.rs
index 7cf49c2..35e8c90 100644
--- a/rc-zip-tokio/tests/integration_tests.rs
+++ b/rc-zip-tokio/tests/integration_tests.rs
@@ -1,26 +1,29 @@
 use positioned_io::RandomAccessFile;
 use rc_zip::{
-    corpus::{self, zips_dir, Case},
+    corpus::{self, zips_dir, Case, Files},
     error::Error,
     parse::Archive,
 };
-use rc_zip_tokio::{AsyncArchive, HasAsyncCursor, ReadZipAsync};
+use rc_zip_tokio::{ArchiveHandle, HasCursor, ReadZip, ReadZipStreaming};
+use tokio::io::AsyncReadExt;
 
 use std::sync::Arc;
 
-async fn check_case<F: HasAsyncCursor>(test: &Case, archive: Result<AsyncArchive<'_, F>, Error>) {
+async fn check_case<F: HasCursor>(test: &Case, archive: Result<ArchiveHandle<'_, F>, Error>) {
     corpus::check_case(test, archive.as_ref().map(|ar| -> &Archive { ar }));
     let archive = match archive {
         Ok(archive) => archive,
         Err(_) => return,
     };
 
-    for file in &test.files {
-        let entry = archive
-            .by_name(file.name)
-            .unwrap_or_else(|| panic!("entry {} should exist", file.name));
+    if let Files::ExhaustiveList(files) = &test.files {
+        for file in files {
+            let entry = archive
+                .by_name(file.name)
+                .unwrap_or_else(|| panic!("entry {} should exist", file.name));
 
-        corpus::check_file_against(file, &entry, &entry.bytes().await.unwrap()[..])
+            corpus::check_file_against(file, &entry, &entry.bytes().await.unwrap()[..])
+        }
     }
 }
 
@@ -28,14 +31,14 @@ async fn check_case<F: HasCursor>(test: &Case, archive: Result<ArchiveHandle<'_
 #[tokio::test]
 async fn real_world_files() {
     for case in corpus::test_cases() {
         tracing::info!("============ testing {}", case.name);
 
-        let file = Arc::new(RandomAccessFile::open(case.absolute_path()).unwrap());
-        let archive = file.read_zip_async().await;
+        let guarded_path = case.absolute_path();
+        let file = Arc::new(RandomAccessFile::open(&guarded_path.path).unwrap());
+        let archive = file.read_zip().await;
 
+        check_case(&case, archive).await;
+        drop(guarded_path)
+    }
+}
+
+#[tokio::test]
+async fn streaming() {
+    for case in corpus::streaming_test_cases() {
+        let guarded_path = case.absolute_path();
+        let file = tokio::fs::File::open(&guarded_path.path).await.unwrap();
+
+        let mut entry = file
+            .stream_zip_entries_throwing_caution_to_the_wind()
+            .await
+            .unwrap();
+        loop {
+            let mut v = vec![];
+            let n = entry.read_to_end(&mut v).await.unwrap();
+            tracing::trace!("entry {} read {} bytes", entry.entry().name, n);
+
+            match entry.finish().await.unwrap() {
+                Some(next) => entry = next,
+                None => break,
+            }
+        }
 
-        check_case(&case, archive).await
+        drop(guarded_path)
     }
 }
diff --git a/rc-zip/Cargo.toml b/rc-zip/Cargo.toml
index c2a46be..8551ceb 100644
--- a/rc-zip/Cargo.toml
+++ b/rc-zip/Cargo.toml
@@ -33,9 +33,11 @@ deflate64 = { version = "0.1.7", optional = true }
 bzip2 = { version = "0.4.4", optional = true }
 lzma-rs = { version = "0.3.0", optional = true, features = ["stream"] }
 zstd = { version = "0.13.0", optional = true }
+ownable = "0.6.2"
+temp-dir = { version = "0.1.12", optional = true }
 
 [features]
-corpus = []
+corpus = ["dep:temp-dir", "dep:bzip2"]
 deflate = ["dep:miniz_oxide"]
 deflate64 = ["dep:deflate64"]
 bzip2 = ["dep:bzip2"]
diff --git a/rc-zip/src/corpus/mod.rs b/rc-zip/src/corpus/mod.rs
index 2cc8cc9..8ae5229 100644
--- a/rc-zip/src/corpus/mod.rs
+++ b/rc-zip/src/corpus/mod.rs
@@ -2,39 +2,82 @@
 
 //! A corpus of zip files for testing.
 
-use std::path::PathBuf;
+use std::{fs::File, path::PathBuf};
 
 use chrono::{DateTime, FixedOffset, TimeZone, Timelike, Utc};
+use temp_dir::TempDir;
 
 use crate::{
     encoding::Encoding,
     error::Error,
-    parse::{Archive, EntryContents, StoredEntry},
+    parse::{Archive, Entry, EntryKind},
 };
 
 pub struct Case {
     pub name: &'static str,
     pub expected_encoding: Option<Encoding>,
     pub comment: Option<&'static str>,
-    pub files: Vec<CaseFile>,
+    pub files: Files,
     pub error: Option<Error>,
 }
 
+pub enum Files {
+    ExhaustiveList(Vec<CaseFile>),
+    NumFiles(usize),
+}
+
+impl Files {
+    fn len(&self) -> usize {
+        match self {
+            Self::ExhaustiveList(list) => list.len(),
+            Self::NumFiles(n) => *n,
+        }
+    }
+}
+
 impl Default for Case {
     fn default() -> Self {
         Self {
             name: "test.zip",
            expected_encoding: None,
             comment: None,
-            files: vec![],
+            files: Files::NumFiles(0),
             error: None,
         }
     }
 }
 
+/// This path may disappear on drop (if the zip is bz2-compressed), so be
+/// careful
+pub struct GuardedPath {
+    pub path: PathBuf,
+    _guard: Option<TempDir>,
+}
+
 impl Case {
-    pub fn absolute_path(&self) -> PathBuf {
-        zips_dir().join(self.name)
+    pub fn absolute_path(&self) -> GuardedPath {
+        let path = zips_dir().join(self.name);
+        if let Some(dec_name) = self.name.strip_suffix(".bz2") {
+            let dir = TempDir::new().unwrap();
+            let dec_path = dir.path().join(dec_name);
+            std::io::copy(
+                &mut File::open(&path).unwrap(),
+                &mut bzip2::write::BzDecoder::new(File::create(&dec_path).unwrap()),
+            )
+            .unwrap();
+            tracing::trace!("decompressed {} to {}", path.display(), dec_path.display());
+            GuardedPath {
+                path: dec_path,
+                _guard: Some(dir),
+            }
+        } else {
+            GuardedPath { path, _guard: None }
+        }
+    }
+
+    pub fn bytes(&self) -> Vec<u8> {
+        let gp = self.absolute_path();
+        std::fs::read(gp.path).unwrap()
     }
 }
 
@@ -92,21 +135,21 @@ pub fn test_cases() -> Vec<Case> {
     vec![
         Case {
             name: "zip64.zip",
-            files: vec![CaseFile {
+            files: Files::ExhaustiveList(vec![CaseFile {
                 name: "README",
                 content: FileContent::Bytes(
                     "This small file is in ZIP64 format.\n".as_bytes().into(),
                 ),
                 modified: Some(date((2012, 8, 10), (14, 33, 32), 0, time_zone(0)).unwrap()),
                 mode: Some(0o644),
-            }],
+            }]),
             ..Default::default()
         },
         Case {
             name: "test.zip",
             comment: Some("This is a zipfile comment."),
             expected_encoding: Some(Encoding::Utf8),
-            files: vec![
+            files: Files::ExhaustiveList(vec![
                 CaseFile {
                     name: "test.txt",
                     content: FileContent::Bytes("This is a test text file.\n".as_bytes().into()),
@@ -119,22 +162,22 @@ pub fn test_cases() -> Vec<Case> {
                     modified: Some(date((2010, 9, 5), (15, 52, 58), 0, time_zone(10)).unwrap()),
                     mode: Some(0o644),
                 },
-            ],
+            ]),
             ..Default::default()
         },
         Case {
             name: "cp-437.zip",
             expected_encoding: Some(Encoding::Cp437),
-            files: vec![CaseFile {
+            files: Files::ExhaustiveList(vec![CaseFile {
                 name: "français",
                 ..Default::default()
-            }],
+            }]),
             ..Default::default()
         },
         Case {
             name: "shift-jis.zip",
             expected_encoding: Some(Encoding::ShiftJis),
-            files: vec![
+            files: Files::ExhaustiveList(vec![
                 CaseFile {
                     name: "should-be-jis/",
                     ..Default::default()
@@ -143,42 +186,48 @@ pub fn test_cases() -> Vec<Case> {
                     name: "should-be-jis/ot_運命のワルツネぞなぞ小さな楽しみ遊びま.longboi",
                     ..Default::default()
                 },
-            ],
+            ]),
             ..Default::default()
         },
         Case {
             name: "utf8-winrar.zip",
             expected_encoding: Some(Encoding::Utf8),
-            files: vec![CaseFile {
+            files: Files::ExhaustiveList(vec![CaseFile {
                 name: "世界",
                 content: FileContent::Bytes(vec![]),
                 modified: Some(date((2017, 11, 6), (21, 9, 27), 867862500, time_zone(0)).unwrap()),
                 ..Default::default()
-            }],
+            }]),
+            ..Default::default()
+        },
+        Case {
+            name: "wine-zeroed.zip.bz2",
+            expected_encoding: Some(Encoding::Utf8),
+            files: Files::NumFiles(11372),
             ..Default::default()
         },
         #[cfg(feature = "lzma")]
         Case {
             name: "found-me-lzma.zip",
             expected_encoding: Some(Encoding::Utf8),
-            files: vec![CaseFile {
+            files: Files::ExhaustiveList(vec![CaseFile {
                 name: "found-me.txt",
                 content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()),
                 modified: Some(date((2024, 1, 26), (16, 14, 35), 46003100, time_zone(0)).unwrap()),
                 ..Default::default()
-            }],
+            }]),
             ..Default::default()
         },
         #[cfg(feature = "deflate64")]
         Case {
             name: "found-me-deflate64.zip",
             expected_encoding: Some(Encoding::Utf8),
-            files: vec![CaseFile {
+            files: Files::ExhaustiveList(vec![CaseFile {
                 name: "found-me.txt",
                 content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()),
                 modified: Some(date((2024, 1, 26), (16, 14, 35), 46003100, time_zone(0)).unwrap()),
                 ..Default::default()
-            }],
+            }]),
             ..Default::default()
         },
         // same with bzip2
@@ -186,12 +235,12 @@ pub fn test_cases() -> Vec<Case> {
         Case {
             name: "found-me-bzip2.zip",
             expected_encoding: Some(Encoding::Utf8),
-            files: vec![CaseFile {
+            files: Files::ExhaustiveList(vec![CaseFile {
                 name: "found-me.txt",
                 content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()),
                 modified: Some(date((2024, 1, 26), (16, 14, 35), 46003100, time_zone(0)).unwrap()),
                 ..Default::default()
-            }],
+            }]),
             ..Default::default()
         },
         // same with zstd
@@ -199,21 +248,29 @@ pub fn test_cases() -> Vec<Case> {
         Case {
             name: "found-me-zstd.zip",
             expected_encoding: Some(Encoding::Utf8),
-            files: vec![CaseFile {
+            files: Files::ExhaustiveList(vec![CaseFile {
                 name: "found-me.txt",
                 content: FileContent::Bytes("Oh no, you found me\n".repeat(5000).into()),
                 modified: Some(date((2024, 1, 31), (6, 10, 25), 800491400, time_zone(0)).unwrap()),
                 ..Default::default()
-            }],
+            }]),
             ..Default::default()
         },
     ]
 }
 
-pub fn check_case(test: &Case, archive: Result<&Archive, &Error>) {
-    let case_bytes = std::fs::read(test.absolute_path()).unwrap();
+pub fn streaming_test_cases() -> Vec<Case> {
+    vec![Case {
+        name: "meta.zip",
+        files: Files::NumFiles(0),
+        ..Default::default()
+    }]
+}
+
+pub fn check_case(case: &Case, archive: Result<&Archive, &Error>) {
+    let case_bytes = case.bytes();
 
-    if let Some(expected) = &test.error {
+    if let Some(expected) = &case.error {
         let actual = match archive {
             Err(e) => e,
             Ok(_) => panic!("should have failed"),
@@ -223,37 +280,40 @@ pub fn check_case(test: &Case, archive: Result<&Archive, &Error>) {
         assert_eq!(expected, actual);
         return;
     }
-    let archive = archive.unwrap();
+    let archive = archive.unwrap_or_else(|e| {
+        panic!(
+            "{} should have succeeded, but instead: {e:?} ({e})",
+            case.name
+        )
+    });
 
     assert_eq!(case_bytes.len() as u64, archive.size());
 
-    if let Some(expected) = test.comment {
-        assert_eq!(expected, archive.comment().expect("should have comment"))
+    if let Some(expected) = case.comment {
+        assert_eq!(expected, archive.comment())
     }
 
-    if let Some(exp_encoding) = test.expected_encoding {
+    if let Some(exp_encoding) = case.expected_encoding {
         assert_eq!(archive.encoding(), exp_encoding);
     }
 
     assert_eq!(
-        test.files.len(),
+        case.files.len(),
         archive.entries().count(),
         "{} should have {} entries",
-        test.name,
-        test.files.len()
+        case.name,
+        case.files.len()
     );
 
     // then each implementation should check individual files
 }
 
-pub fn check_file_against(file: &CaseFile, entry: &StoredEntry, actual_bytes: &[u8]) {
+pub fn check_file_against(file: &CaseFile, entry: &Entry, actual_bytes: &[u8]) {
     if let Some(expected) = file.modified {
         assert_eq!(
-            expected,
-            entry.modified(),
+            expected, entry.modified,
             "entry {} should have modified = {:?}",
-            entry.name(),
-            expected
+            entry.name, expected
         )
     }
 
@@ -262,10 +322,10 @@ pub fn check_file_against(file: &CaseFile, entry: &Entry, actual_bytes: &[u8]) {
     }
 
     // I have honestly yet to see a zip file _entry_ with a comment.
-    assert!(entry.comment().is_none());
+    assert!(entry.comment.is_empty());
 
-    match entry.contents() {
-        EntryContents::File => {
+    match entry.kind() {
+        EntryKind::File => {
             match &file.content {
                 FileContent::Unchecked => {
                     // ah well
@@ -283,7 +343,7 @@ pub fn check_file_against(file: &CaseFile, entry: &Entry, actual_bytes: &[u8]) {
                 }
             }
         }
-        EntryContents::Symlink | EntryContents::Directory => {
+        EntryKind::Symlink | EntryKind::Directory => {
             assert!(matches!(file.content, FileContent::Unchecked));
         }
     }
 }
diff --git a/rc-zip/src/format/archive.rs b/rc-zip/src/format/archive.rs
deleted file mode 100644
index dc2bb67..0000000
--- a/rc-zip/src/format/archive.rs
+++ /dev/null
@@ -1,326 +0,0 @@
-use crate::format::*;
-use num_enum::{FromPrimitive, IntoPrimitive};
-
-/// An Archive contains general information about a zip file,
-/// along with a list of [entries][StoredEntry].
-///
-/// It is obtained via an [ArchiveReader](crate::reader::ArchiveReader), or via a higher-level API
-/// like the [ReadZip](crate::reader::sync::ReadZip) trait.
-pub struct Archive {
-    pub(crate) size: u64,
-    pub(crate) encoding: Encoding,
-    pub(crate) entries: Vec<StoredEntry>,
-    pub(crate) comment: Option<String>,
-}
-
-impl Archive {
-    /// The size of .zip file that was read, in bytes.
-    pub fn size(&self) -> u64 {
-        self.size
-    }
-
-    /// Iterate over all files in this zip, read from the central directory.
-    pub fn entries(&self) -> impl Iterator<Item = &StoredEntry> {
-        self.entries.iter()
-    }
-
-    /// Attempts to look up an entry by name. This is usually a bad idea,
-    /// as names aren't necessarily normalized in zip archives.
-    pub fn by_name<N: AsRef<str>>(&self, name: N) -> Option<&StoredEntry> {
-        self.entries.iter().find(|&x| x.name() == name.as_ref())
-    }
-
-    /// Returns the detected character encoding for text fields
-    /// (names, comments) inside this zip archive.
-    pub fn encoding(&self) -> Encoding {
-        self.encoding
-    }
-
-    /// Returns the comment for this archive, if any. When reading
-    /// a zip file with an empty comment field, this will return None.
-    pub fn comment(&self) -> Option<&String> {
-        self.comment.as_ref()
-    }
-}
-
-/// Describes a zip archive entry (a file, a directory, a symlink)
-///
-/// `Entry` contains normalized metadata fields, that can be set when
-/// writing a zip archive. Additional metadata, along with the information
-/// required to extract an entry, are available in [StoredEntry][] instead.
-#[derive(Clone)]
-pub struct Entry {
-    /// Name of the file
-    /// Must be a relative path, not start with a drive letter (e.g. C:),
-    /// and must use forward slashes instead of back slashes
-    pub name: String,
-
-    /// Compression method
-    ///
-    /// See [Method][] for more details.
-    pub method: Method,
-
-    /// Comment is any arbitrary user-defined string shorter than 64KiB
-    pub comment: Option<String>,
-
-    /// Modified timestamp
-    pub modified: chrono::DateTime<chrono::offset::Utc>,
-
-    /// Created timestamp
-    pub created: Option<chrono::DateTime<chrono::offset::Utc>>,
-
-    /// Accessed timestamp
-    pub accessed: Option<chrono::DateTime<chrono::offset::Utc>>,
-}
-
-/// An entry as stored into an Archive. Contains additional metadata and offset information.
-///
-/// Whereas [Entry][] is archive-independent, [StoredEntry][] contains information that is tied to
-/// a specific archive.
-///
-/// When reading archives, one deals with a list of [StoredEntry][], whereas when writing one, one
-/// typically only specifies an [Entry][] and provides the entry's contents: fields like the CRC32
-/// hash, uncompressed size, and compressed size are derived automatically from the input.
-#[derive(Clone)]
-pub struct StoredEntry {
-    /// Archive-independent information
-    ///
-    /// This contains the entry's name, timestamps, comment, compression method.
-    pub entry: Entry,
-
-    /// Offset of the local file header in the zip file
-    ///
-    /// ```text
-    /// [optional non-zip data]
-    /// [local file header 1] <------ header_offset points here
-    /// [encryption header 1]
-    /// [file data 1]
-    /// [data descriptor 1]
-    /// ...
-    /// [central directory]
-    /// [optional zip64 end of central directory info]
-    /// [end of central directory record]
-    /// ```
-    pub header_offset: u64,
-
-    /// External attributes (zip)
-    pub external_attrs: u32,
-
-    /// Version of zip supported by the tool that created this archive.
-    pub creator_version: Version,
-
-    /// Version of zip needed to extract this archive.
-    pub reader_version: Version,
-
-    /// General purpose bit flag
-    ///
-    /// In the zip format, the most noteworthy flag (bit 11) is for UTF-8 names.
-    /// Other flags can indicate: encryption (unsupported), various compression
-    /// settings (depending on the [Method] used).
-    ///
-    /// For LZMA, general-purpose bit 1 denotes the EOS marker.
-    pub flags: u16,
-
-    /// Unix user ID
-    ///
-    /// Only present if a Unix extra field or New Unix extra field was found.
-    pub uid: Option<u32>,
-
-    /// Unix group ID
-    ///
-    /// Only present if a Unix extra field or New Unix extra field was found.
-    pub gid: Option<u32>,
-
-    /// File mode
-    pub mode: Mode,
-
-    /// Any extra fields recognized while parsing the file.
-    ///
-    /// Most of these should be normalized and accessible as other fields,
-    /// but they are also made available here raw.
-    pub extra_fields: Vec<ExtraField>,
-
-    pub inner: StoredEntryInner,
-}
-
-#[derive(Clone, Copy, Debug)]
-pub struct StoredEntryInner {
-    /// CRC-32 hash as found in the central directory.
-    ///
-    /// Note that this may be zero, and the actual CRC32 might be in the local header, or (more
-    /// commonly) in the data descriptor instead.
-    pub crc32: u32,
-
-    /// Size in bytes, after compression
-    pub compressed_size: u64,
-
-    /// Size in bytes, before compression
-    ///
-    /// This will be zero for directories.
-    pub uncompressed_size: u64,
-
-    /// True if this entry was read from a zip64 archive
-    pub is_zip64: bool,
-}
-
-impl StoredEntry {
-    /// Returns the entry's name. See also
-    /// [sanitized_name()](StoredEntry::sanitized_name), which returns a
-    /// sanitized version of the name.
-    ///
-    /// This should be a relative path, separated by `/`. However, there are zip
-    /// files in the wild with all sorts of evil variants, so, be conservative
-    /// in what you accept.
-    pub fn name(&self) -> &str {
-        self.entry.name.as_ref()
-    }
-
-    /// Returns a sanitized version of the entry's name, if it
-    /// seems safe. In particular, if this method feels like the
-    /// entry name is trying to do a zip slip (cf.
-    /// <https://snyk.io/research/zip-slip-vulnerability>), it'll return
-    /// None.
-    ///
-    /// Other than that, it will strip any leading slashes on non-Windows OSes.
- pub fn sanitized_name(&self) -> Option<&str> { - let name = self.name(); - - // refuse entries with traversed/absolute path to mitigate zip slip - if name.contains("..") { - return None; - } - - #[cfg(windows)] - { - if name.contains(":\\") || name.starts_with("\\") { - return None; - } - Some(name) - } - - #[cfg(not(windows))] - { - // strip absolute prefix on entries pointing to root path - let mut entry_chars = name.chars(); - let mut name = name; - while name.starts_with('/') { - entry_chars.next(); - name = entry_chars.as_str() - } - Some(name) - } - } - - /// The entry's comment, if any. - /// - /// When reading a zip file, an empty comment results in None. - pub fn comment(&self) -> Option<&str> { - self.entry.comment.as_ref().map(|x| x.as_ref()) - } - - /// The compression method used for this entry - #[inline(always)] - pub fn method(&self) -> Method { - self.entry.method - } - - /// This entry's "last modified" timestamp - with caveats - /// - /// Due to the history of the ZIP file format, this may be inaccurate. It may be offset - /// by a few hours, if there is no extended timestamp information. It may have a resolution - /// as low as two seconds, if only MSDOS timestamps are present. It may default to the Unix - /// epoch, if something went really wrong. - /// - /// If you're reading this after the year 2038, or after the year 2108, godspeed. - #[inline(always)] - pub fn modified(&self) -> DateTime { - self.entry.modified - } - - /// This entry's "created" timestamp, if available. - /// - /// See [StoredEntry::modified()] for caveats. - #[inline(always)] - pub fn created(&self) -> Option<&DateTime> { - self.entry.created.as_ref() - } - - /// This entry's "last accessed" timestamp, if available. - /// - /// See [StoredEntry::modified()] for caveats. - #[inline(always)] - pub fn accessed(&self) -> Option<&DateTime> { - self.entry.accessed.as_ref() - } -} - -/// The contents of an entry: a directory, a file, or a symbolic link. -#[derive(Debug)] -pub enum EntryContents { - Directory, - File, - Symlink, -} - -impl StoredEntry { - pub fn contents(&self) -> EntryContents { - if self.mode.has(Mode::SYMLINK) { - EntryContents::Symlink - } else if self.mode.has(Mode::DIR) { - EntryContents::Directory - } else { - EntryContents::File - } - } -} - -/// Compression method used for a file entry. -/// -/// In archives that follow [ISO/IEC 21320-1:2015](https://www.iso.org/standard/60101.html), only -/// [Store][Method::Store] and [Deflate][Method::Deflate] should be used. -/// -/// However, in the wild, it is not too uncommon to encounter [Bzip2][Method::Bzip2], -/// [Lzma][Method::Lzma] or others. 
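Stepping back to `sanitized_name` above: a minimal, dependency-free sketch of the contract it documents, with hand-picked inputs that are not taken from the crate's test suite:

```rust
// Illustrative only — mirrors the rules documented for sanitized_name()
// (refuse traversal, strip leading slashes on non-Windows OSes).
fn sanitize(name: &str) -> Option<&str> {
    // refuse traversed/absolute paths to mitigate zip slip
    if name.contains("..") {
        return None;
    }
    // strip any leading slashes (the non-Windows branch)
    Some(name.trim_start_matches('/'))
}

fn main() {
    assert_eq!(sanitize("docs/readme.md"), Some("docs/readme.md"));
    assert_eq!(sanitize("/etc/motd"), Some("etc/motd"));
    assert_eq!(sanitize("../../etc/passwd"), None); // zip slip attempt
}
```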
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, IntoPrimitive, FromPrimitive)]
-#[repr(u16)]
-pub enum Method {
-    /// No compression is applied
-    Store = 0,
-
-    /// [DEFLATE (RFC 1951)](https://www.ietf.org/rfc/rfc1951.txt)
-    Deflate = 8,
-
-    /// [DEFLATE64](https://deflate64.com/)
-    Deflate64 = 9,
-
-    /// [BZIP-2](https://github.com/dsnet/compress/blob/master/doc/bzip2-format.pdf)
-    Bzip2 = 12,
-
-    /// [LZMA](https://github.com/jljusten/LZMA-SDK/blob/master/DOC/lzma-specification.txt)
-    Lzma = 14,
-
-    /// [zstd](https://datatracker.ietf.org/doc/html/rfc8878)
-    Zstd = 93,
-
-    /// [MP3](https://www.iso.org/obp/ui/#iso:std:iso-iec:11172:-3:ed-1:v1:en)
-    Mp3 = 94,
-
-    /// [XZ](https://tukaani.org/xz/xz-file-format.txt)
-    Xz = 95,
-
-    /// [JPEG](https://jpeg.org/jpeg/)
-    Jpeg = 96,
-
-    /// [WavPack](https://www.wavpack.com/)
-    WavPack = 97,
-
-    /// [PPMd](https://en.wikipedia.org/wiki/Prediction_by_partial_matching)
-    Ppmd = 98,
-
-    /// AE-x encryption marker (see Appendix E of appnote)
-    Aex = 99,
-
-    /// A compression method that isn't recognized by this crate.
-    #[num_enum(catch_all)]
-    Unrecognized(u16),
-}
diff --git a/rc-zip/src/format/date_time.rs b/rc-zip/src/format/date_time.rs
deleted file mode 100644
index baeee9a..0000000
--- a/rc-zip/src/format/date_time.rs
+++ /dev/null
@@ -1,104 +0,0 @@
-use chrono::{
-    offset::{LocalResult, TimeZone, Utc},
-    DateTime, Timelike,
-};
-use std::fmt;
-use winnow::{
-    binary::{le_u16, le_u64},
-    seq, PResult, Parser, Partial,
-};
-
-/// A timestamp in MS-DOS format
-///
-/// Represents dates from year 1980 to 2107, with 2 second precision.
-#[derive(Clone, Copy, Eq, PartialEq)]
-pub struct MsdosTimestamp {
-    pub time: u16,
-    pub date: u16,
-}
-
-impl fmt::Debug for MsdosTimestamp {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        match self.to_datetime() {
-            Some(dt) => write!(f, "MsdosTimestamp({})", dt),
-            None => write!(f, "MsdosTimestamp(?)"),
-        }
-    }
-}
-
-impl MsdosTimestamp {
-    pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult<Self> {
-        seq! {Self {
-            time: le_u16,
-            date: le_u16,
-        }}
-        .parse_next(i)
-    }
-
-    /// Attempts to convert to a chrono UTC date time
-    pub fn to_datetime(&self) -> Option<DateTime<Utc>> {
-        // see https://docs.microsoft.com/en-us/windows/win32/api/winbase/nf-winbase-dosdatetimetofiletime
-        let date = match {
-            // bits 0-4: day of the month (1-31)
-            let d = (self.date & 0b1_1111) as u32;
-            // bits 5-8: month (1 = january, 2 = february and so on)
-            let m = ((self.date >> 5) & 0b1111) as u32;
-            // bits 9-15: year offset from 1980
-            let y = ((self.date >> 9) + 1980) as i32;
-            Utc.with_ymd_and_hms(y, m, d, 0, 0, 0)
-        } {
-            LocalResult::Single(date) => date,
-            _ => return None,
-        };
-
-        // bits 0-4: second divided by 2
-        let s = (self.time & 0b1_1111) as u32 * 2;
-        // bits 5-10: minute (0-59)
-        let m = (self.time >> 5 & 0b11_1111) as u32;
-        // bits 11-15: hour (0-23 on a 24-hour clock)
-        let h = (self.time >> 11) as u32;
-        date.with_hour(h)?.with_minute(m)?.with_second(s)
-    }
-}
-
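To make the bit layout above concrete — and the NTFS tick format that follows — here is a hand-worked example; all values are made up for illustration, not taken from a real archive:

```rust
fn main() {
    // MS-DOS date: bits 9-15 = years since 1980, bits 5-8 = month, bits 0-4 = day
    let date: u16 = (38 << 9) | (9 << 5) | 11; // 2018-09-11
    assert_eq!(
        ((date >> 9) + 1980, (date >> 5) & 0b1111, date & 0b1_1111),
        (2018, 9, 11)
    );

    // MS-DOS time: bits 11-15 = hour, bits 5-10 = minute, bits 0-4 = seconds / 2
    let time: u16 = (13 << 11) | (42 << 5) | (30 / 2); // 13:42:30
    assert_eq!(
        (time >> 11, (time >> 5) & 0b11_1111, (time & 0b1_1111) * 2),
        (13, 42, 30)
    );

    // NTFS: 100 ns ticks since 1601-01-01. The NTFS and Unix epochs are
    // 11_644_473_600 seconds apart, so this tick count is exactly
    // 1970-01-01T00:00:00Z.
    let ticks: u64 = 116_444_736_000_000_000;
    assert_eq!(ticks / 10_000_000 - 11_644_473_600, 0);
}
```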
-/// A timestamp in NTFS format.
-#[derive(Clone, Copy, Eq, PartialEq)]
-pub struct NtfsTimestamp {
-    pub timestamp: u64,
-}
-
-impl fmt::Debug for NtfsTimestamp {
-    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
-        match self.to_datetime() {
-            Some(dt) => write!(f, "NtfsTimestamp({})", dt),
-            None => write!(f, "NtfsTimestamp(?)"),
-        }
-    }
-}
-
-impl NtfsTimestamp {
-    /// Parse an NTFS timestamp from a byte slice
-    pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult<Self> {
-        le_u64.map(|timestamp| Self { timestamp }).parse_next(i)
-    }
-
-    /// Attempts to convert to a chrono UTC date time
-    pub fn to_datetime(&self) -> Option<DateTime<Utc>> {
-        // windows timestamp resolution
-        let ticks_per_second = 10_000_000;
-        let secs = (self.timestamp / ticks_per_second) as i64;
-        let nsecs = ((self.timestamp % ticks_per_second) * 100) as u32;
-        let epoch = Utc.with_ymd_and_hms(1601, 1, 1, 0, 0, 0).single()?;
-        match Utc.timestamp_opt(epoch.timestamp() + secs, nsecs) {
-            LocalResult::Single(date) => Some(date),
-            _ => None,
-        }
-    }
-}
-
-pub(crate) fn zero_datetime() -> chrono::DateTime<chrono::offset::Utc> {
-    chrono::DateTime::from_naive_utc_and_offset(
-        chrono::naive::NaiveDateTime::from_timestamp_opt(0, 0).unwrap(),
-        chrono::offset::Utc,
-    )
-}
diff --git a/rc-zip/src/format/directory_header.rs b/rc-zip/src/format/directory_header.rs
deleted file mode 100644
index ff73cf6..0000000
--- a/rc-zip/src/format/directory_header.rs
+++ /dev/null
@@ -1,244 +0,0 @@
-use crate::{encoding, error::*, format::*};
-use chrono::offset::TimeZone;
-use tracing::trace;
-use winnow::{
-    binary::{le_u16, le_u32},
-    prelude::PResult,
-    token::tag,
-    Parser, Partial,
-};
-
-/// 4.3.12 Central directory structure: File header
-pub struct DirectoryHeader {
-    // version made by
-    pub creator_version: Version,
-    // version needed to extract
-    pub reader_version: Version,
-    // general purpose bit flag
-    pub flags: u16,
-    // compression method
-    pub method: u16,
-    // last mod file datetime
-    pub modified: MsdosTimestamp,
-    // crc32
-    pub crc32: u32,
-    // compressed size
-    pub compressed_size: u32,
-    // uncompressed size
-    pub uncompressed_size: u32,
-    // disk number start
-    pub disk_nbr_start: u16,
-    // internal file attributes
-    pub internal_attrs: u16,
-    // external file attributes
-    pub external_attrs: u32,
-    // relative offset of local header
-    pub header_offset: u32,
-
-    // name
-    pub name: ZipString,
-    // extra
-    pub extra: ZipBytes,
-    // comment
-    pub comment: ZipString,
-}
-
-impl DirectoryHeader {
-    const SIGNATURE: &'static str = "PK\x01\x02";
-
-    pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult<Self> {
-        _ = tag(Self::SIGNATURE).parse_next(i)?;
-        let creator_version = Version::parser.parse_next(i)?;
-        let reader_version = Version::parser.parse_next(i)?;
-        let flags = le_u16.parse_next(i)?;
-        let method = le_u16.parse_next(i)?;
-        let modified = MsdosTimestamp::parser.parse_next(i)?;
-        let crc32 = le_u32.parse_next(i)?;
-        let compressed_size = le_u32.parse_next(i)?;
-        let uncompressed_size = le_u32.parse_next(i)?;
-        let name_len = le_u16.parse_next(i)?;
-        let extra_len = le_u16.parse_next(i)?;
-        let comment_len = le_u16.parse_next(i)?;
-        let disk_nbr_start = le_u16.parse_next(i)?;
-        let internal_attrs = le_u16.parse_next(i)?;
-        let external_attrs = le_u32.parse_next(i)?;
-        let header_offset = le_u32.parse_next(i)?;
-
-        let name = ZipString::parser(name_len).parse_next(i)?;
-        let extra = ZipBytes::parser(extra_len).parse_next(i)?;
-        let comment = ZipString::parser(comment_len).parse_next(i)?;
-
-        Ok(Self {
-            creator_version,
-            reader_version,
-            flags,
-            method,
-            modified,
-            crc32,
compressed_size, - uncompressed_size, - disk_nbr_start, - internal_attrs, - external_attrs, - header_offset, - name, - extra, - comment, - }) - } -} - -impl DirectoryHeader { - pub fn is_non_utf8(&self) -> bool { - let (valid1, require1) = encoding::detect_utf8(&self.name.0[..]); - let (valid2, require2) = encoding::detect_utf8(&self.comment.0[..]); - if !valid1 || !valid2 { - // definitely not utf-8 - return true; - } - - if !require1 && !require2 { - // name and comment only use single-byte runes that overlap with UTF-8 - return false; - } - - // Might be UTF-8, might be some other encoding; preserve existing flag. - // Some ZIP writers use UTF-8 encoding without setting the UTF-8 flag. - // Since it is impossible to always distinguish valid UTF-8 from some - // other encoding (e.g., GBK or Shift-JIS), we trust the flag. - self.flags & 0x800 == 0 - } - - pub fn as_stored_entry( - &self, - is_zip64: bool, - encoding: Encoding, - global_offset: u64, - ) -> Result { - let mut comment: Option = None; - if let Some(comment_field) = self.comment.clone().into_option() { - comment = Some(encoding.decode(&comment_field.0)?); - } - - let name = encoding.decode(&self.name.0)?; - - let mut compressed_size = self.compressed_size as u64; - let mut uncompressed_size = self.uncompressed_size as u64; - let mut header_offset = self.header_offset as u64 + global_offset; - - let mut modified: Option> = None; - let mut created: Option> = None; - let mut accessed: Option> = None; - - let mut uid: Option = None; - let mut gid: Option = None; - - let mut extra_fields: Vec = Vec::new(); - - let settings = ExtraFieldSettings { - needs_compressed_size: self.compressed_size == !0u32, - needs_uncompressed_size: self.uncompressed_size == !0u32, - needs_header_offset: self.header_offset == !0u32, - }; - - let mut slice = Partial::new(&self.extra.0[..]); - while !slice.is_empty() { - match ExtraField::mk_parser(settings).parse_next(&mut slice) { - Ok(ef) => { - match &ef { - ExtraField::Zip64(z64) => { - if let Some(n) = z64.uncompressed_size { - uncompressed_size = n; - } - if let Some(n) = z64.compressed_size { - compressed_size = n; - } - if let Some(n) = z64.header_offset { - header_offset = n; - } - } - ExtraField::Timestamp(ts) => { - modified = Utc.timestamp_opt(ts.mtime as i64, 0).single(); - } - ExtraField::Ntfs(nf) => { - for attr in &nf.attrs { - // note: other attributes are unsupported - if let NtfsAttr::Attr1(attr) = attr { - modified = attr.mtime.to_datetime(); - created = attr.ctime.to_datetime(); - accessed = attr.atime.to_datetime(); - } - } - } - ExtraField::Unix(uf) => { - modified = Utc.timestamp_opt(uf.mtime as i64, 0).single(); - if uid.is_none() { - uid = Some(uf.uid as u32); - } - if gid.is_none() { - gid = Some(uf.gid as u32); - } - } - ExtraField::NewUnix(uf) => { - uid = Some(uf.uid as u32); - gid = Some(uf.uid as u32); - } - _ => {} - }; - extra_fields.push(ef); - } - Err(e) => { - trace!("extra field error: {:#?}", e); - return Err(FormatError::InvalidExtraField.into()); - } - } - } - - let modified = match modified { - Some(m) => Some(m), - None => self.modified.to_datetime(), - }; - - let mut mode: Mode = match self.creator_version.host_system() { - HostSystem::Unix | HostSystem::Osx => UnixMode(self.external_attrs >> 16).into(), - HostSystem::WindowsNtfs | HostSystem::Vfat | HostSystem::MsDos => { - MsdosMode(self.external_attrs).into() - } - _ => Mode(0), - }; - if name.ends_with('/') { - // believe it or not, this is straight from the APPNOTE - mode |= Mode::DIR - }; - - 
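        // (Note: by this point, any size or offset that was stored in the
        // fixed-size header as the 32-bit sentinel 0xFFFF_FFFF — written
        // `!0u32` above — has been replaced by the real 64-bit value from
        // the zip64 extra field.)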
Ok(StoredEntry { - entry: Entry { - name, - method: self.method.into(), - comment, - modified: modified.unwrap_or_else(zero_datetime), - created, - accessed, - }, - - creator_version: self.creator_version, - reader_version: self.reader_version, - flags: self.flags, - - inner: StoredEntryInner { - crc32: self.crc32, - compressed_size, - uncompressed_size, - is_zip64, - }, - header_offset, - - uid, - gid, - mode, - - extra_fields, - - external_attrs: self.external_attrs, - }) - } -} diff --git a/rc-zip/src/format/eocd.rs b/rc-zip/src/format/eocd.rs deleted file mode 100644 index cc1c7f5..0000000 --- a/rc-zip/src/format/eocd.rs +++ /dev/null @@ -1,263 +0,0 @@ -use crate::{error::*, format::*}; -use tracing::trace; -use winnow::{ - binary::{le_u16, le_u32, le_u64, length_take}, - seq, - token::tag, - PResult, Parser, Partial, -}; - -/// 4.3.16 End of central directory record: -#[derive(Debug)] -pub struct EndOfCentralDirectoryRecord { - /// number of this disk - pub disk_nbr: u16, - /// number of the disk with the start of the central directory - pub dir_disk_nbr: u16, - /// total number of entries in the central directory on this disk - pub dir_records_this_disk: u16, - /// total number of entries in the central directory - pub directory_records: u16, - // size of the central directory - pub directory_size: u32, - /// offset of start of central directory with respect to the starting disk number - pub directory_offset: u32, - /// .ZIP file comment - pub comment: ZipString, -} - -impl EndOfCentralDirectoryRecord { - /// Does not include comment size & comment data - const MIN_LENGTH: usize = 20; - const SIGNATURE: &'static str = "PK\x05\x06"; - - pub fn find_in_block(b: &[u8]) -> Option> { - for i in (0..(b.len() - Self::MIN_LENGTH + 1)).rev() { - let mut input = Partial::new(&b[i..]); - if let Ok(directory) = Self::parser.parse_next(&mut input) { - return Some(Located { - offset: i as u64, - inner: directory, - }); - } - } - None - } - - pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { - let _ = tag(Self::SIGNATURE).parse_next(i)?; - seq! {Self { - disk_nbr: le_u16, - dir_disk_nbr: le_u16, - dir_records_this_disk: le_u16, - directory_records: le_u16, - directory_size: le_u32, - directory_offset: le_u32, - comment: length_take(le_u16).map(ZipString::from), - }} - .parse_next(i) - } -} - -/// 4.3.15 Zip64 end of central directory locator -#[derive(Debug)] -pub struct EndOfCentralDirectory64Locator { - /// number of the disk with the start of the zip64 end of central directory - pub dir_disk_number: u32, - /// relative offset of the zip64 end of central directory record - pub directory_offset: u64, - /// total number of disks - pub total_disks: u32, -} - -impl EndOfCentralDirectory64Locator { - pub const LENGTH: usize = 20; - const SIGNATURE: &'static str = "PK\x06\x07"; - - pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { - _ = tag(Self::SIGNATURE).parse_next(i)?; - seq! 
{Self { - dir_disk_number: le_u32, - directory_offset: le_u64, - total_disks: le_u32, - }} - .parse_next(i) - } -} - -/// 4.3.14 Zip64 end of central directory record -#[derive(Debug)] -pub struct EndOfCentralDirectory64Record { - /// size of zip64 end of central directory record - pub record_size: u64, - /// version made by - pub creator_version: u16, - /// version needed to extract - pub reader_version: u16, - /// number of this disk - pub disk_nbr: u32, - /// number of the disk with the start of the central directory - pub dir_disk_nbr: u32, - // total number of entries in the central directory on this disk - pub dir_records_this_disk: u64, - // total number of entries in the central directory - pub directory_records: u64, - // size of the central directory - pub directory_size: u64, - // offset of the start of central directory with respect to the - // starting disk number - pub directory_offset: u64, -} - -impl EndOfCentralDirectory64Record { - const SIGNATURE: &'static str = "PK\x06\x06"; - - pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { - _ = tag(Self::SIGNATURE).parse_next(i)?; - seq! {Self { - record_size: le_u64, - creator_version: le_u16, - reader_version: le_u16, - disk_nbr: le_u32, - dir_disk_nbr: le_u32, - dir_records_this_disk: le_u64, - directory_records: le_u64, - directory_size: le_u64, - directory_offset: le_u64, - }} - .parse_next(i) - } -} - -#[derive(Debug)] -pub struct Located { - pub offset: u64, - pub inner: T, -} - -impl std::ops::Deref for Located { - type Target = T; - fn deref(&self) -> &Self::Target { - &self.inner - } -} - -impl std::ops::DerefMut for Located { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.inner - } -} - -/// Coalesces zip and zip64 "end of central directory" record info -pub struct EndOfCentralDirectory { - pub dir: Located, - pub dir64: Option>, - pub global_offset: i64, -} - -impl EndOfCentralDirectory { - pub fn new( - size: u64, - dir: Located, - dir64: Option>, - ) -> Result { - let mut res = Self { - dir, - dir64, - global_offset: 0, - }; - - // - // Pure .zip files look like this: - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // <------directory_size-----> - // [ Data 1 ][ Data 2 ][ Central directory ][ ??? ] - // ^ ^ ^ - // 0 directory_offset directory_end_offset - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // - // But there exist some valid zip archives with padding at the beginning, like so: - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // <--global_offset-> <------directory_size-----> - // [ Padding ][ Data 1 ][ Data 2 ][ Central directory ][ ??? ] - // ^ ^ ^ ^ - // 0 global_offset computed_directory_offset directory_end_offset - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // - // (e.g. https://www.icculus.org/mojosetup/ installers are ELF binaries with a .zip file appended) - // - // `directory_end_offfset` is found by scanning the file (so it accounts for padding), but - // `directory_offset` is found by reading a data structure (so it does not account for padding). - // If we just trusted `directory_offset`, we'd be reading the central directory at the wrong place: - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // <------directory_size-----> - // [ Padding ][ Data 1 ][ Data 2 ][ Central directory ][ ??? ] - // ^ ^ ^ - // 0 directory_offset - woops! 
directory_end_offset - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - let computed_directory_offset = res.located_directory_offset() - res.directory_size(); - - // did we find a valid offset? - if (0..size).contains(&computed_directory_offset) { - // that's different from the recorded one? - if computed_directory_offset != res.directory_offset() { - // then assume the whole file is offset - res.global_offset = - computed_directory_offset as i64 - res.directory_offset() as i64; - res.set_directory_offset(computed_directory_offset); - } - } - - // make sure directory_offset points to somewhere in our file - trace!( - "directory offset = {}, valid range = 0..{}", - res.directory_offset(), - size - ); - if !(0..size).contains(&res.directory_offset()) { - return Err(FormatError::DirectoryOffsetPointsOutsideFile.into()); - } - - Ok(res) - } - - pub fn located_directory_offset(&self) -> u64 { - match self.dir64.as_ref() { - Some(d64) => d64.offset, - None => self.dir.offset, - } - } - - pub fn directory_offset(&self) -> u64 { - match self.dir64.as_ref() { - Some(d64) => d64.directory_offset, - None => self.dir.directory_offset as u64, - } - } - - pub fn directory_size(&self) -> u64 { - match self.dir64.as_ref() { - Some(d64) => d64.directory_size, - None => self.dir.directory_size as u64, - } - } - - pub fn set_directory_offset(&mut self, offset: u64) { - match self.dir64.as_mut() { - Some(d64) => d64.directory_offset = offset, - None => self.dir.directory_offset = offset as u32, - }; - } - - pub fn directory_records(&self) -> u64 { - match self.dir64.as_ref() { - Some(d64) => d64.directory_records, - None => self.dir.directory_records as u64, - } - } - - pub fn comment(&self) -> &ZipString { - &self.dir.comment - } -} diff --git a/rc-zip/src/format/extra_field.rs b/rc-zip/src/format/extra_field.rs deleted file mode 100644 index c8d8627..0000000 --- a/rc-zip/src/format/extra_field.rs +++ /dev/null @@ -1,289 +0,0 @@ -use crate::format::*; -use tracing::trace; -use winnow::{ - binary::{le_u16, le_u32, le_u64, le_u8, length_take}, - combinator::{cond, opt, preceded, repeat_till}, - error::{ErrMode, ErrorKind, ParserError, StrContext}, - seq, - token::{tag, take}, - PResult, Parser, Partial, -}; -/// 4.4.28 extra field: (Variable) -pub(crate) struct ExtraFieldRecord<'a> { - pub(crate) tag: u16, - pub(crate) payload: &'a [u8], -} - -impl<'a> ExtraFieldRecord<'a> { - pub(crate) fn parser(i: &mut Partial<&'a [u8]>) -> PResult { - seq! {Self { - tag: le_u16, - payload: length_take(le_u16), - }} - .parse_next(i) - } -} - -// Useful because zip64 extended information extra field has fixed order *but* -// optional fields. From the appnote: -// -// If one of the size or offset fields in the Local or Central directory record -// is too small to hold the required data, a Zip64 extended information record -// is created. The order of the fields in the zip64 extended information record -// is fixed, but the fields MUST only appear if the corresponding Local or -// Central directory record field is set to 0xFFFF or 0xFFFFFFFF. -#[derive(Debug, Clone, Copy)] -pub(crate) struct ExtraFieldSettings { - pub(crate) needs_uncompressed_size: bool, - pub(crate) needs_compressed_size: bool, - pub(crate) needs_header_offset: bool, -} - -/// Information stored in the central directory header `extra` field -/// -/// This typically contains timestamps, file sizes and offsets, file mode, uid/gid, etc. -/// -/// See `extrafld.txt` in this crate's source distribution. 
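The `ExtraFieldRecord` parser above is plain TLV framing: a little-endian u16 tag, a u16 payload length, then that many payload bytes, repeated until the `extra` buffer runs out. A dependency-free sketch of the same walk over hand-built input (0x5455 is the real extended-timestamp tag; 0xCAFE is made up):

```rust
fn main() {
    // two records: tag 0x5455 with 2 payload bytes, tag 0xCAFE with 1 byte
    let extra: &[u8] = &[
        0x55, 0x54, 0x02, 0x00, 0xAA, 0xBB, // tag, len, payload
        0xFE, 0xCA, 0x01, 0x00, 0xCC,
    ];
    let mut rest = extra;
    while rest.len() >= 4 {
        let tag = u16::from_le_bytes([rest[0], rest[1]]);
        let len = u16::from_le_bytes([rest[2], rest[3]]) as usize;
        let payload = &rest[4..4 + len];
        println!("tag {tag:#06x}, {len} payload byte(s): {payload:?}");
        rest = &rest[4 + len..];
    }
}
```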
-#[derive(Clone)] -pub enum ExtraField { - /// Zip64 extended information extra field - Zip64(ExtraZip64Field), - /// Extended timestamp - Timestamp(ExtraTimestampField), - /// UNIX & Info-Zip UNIX - Unix(ExtraUnixField), - /// New UNIX extra field - NewUnix(ExtraNewUnixField), - /// NTFS (Win9x/WinNT FileTimes) - Ntfs(ExtraNtfsField), - /// Unknown extra field, with tag - Unknown { tag: u16 }, -} - -impl ExtraField { - pub(crate) fn mk_parser( - settings: ExtraFieldSettings, - ) -> impl FnMut(&mut Partial<&'_ [u8]>) -> PResult { - move |i| { - use ExtraField as EF; - let rec = ExtraFieldRecord::parser.parse_next(i)?; - trace!("parsing extra field record, tag {:04x}", rec.tag); - let payload = &mut Partial::new(rec.payload); - - let variant = match rec.tag { - ExtraZip64Field::TAG => opt(ExtraZip64Field::mk_parser(settings).map(EF::Zip64)) - .context(StrContext::Label("zip64")) - .parse_next(payload)?, - ExtraTimestampField::TAG => opt(ExtraTimestampField::parser.map(EF::Timestamp)) - .context(StrContext::Label("timestamp")) - .parse_next(payload)?, - ExtraNtfsField::TAG => { - opt(ExtraNtfsField::parse.map(EF::Ntfs)).parse_next(payload)? - } - ExtraUnixField::TAG | ExtraUnixField::TAG_INFOZIP => { - opt(ExtraUnixField::parser.map(EF::Unix)).parse_next(payload)? - } - ExtraNewUnixField::TAG => { - opt(ExtraNewUnixField::parser.map(EF::NewUnix)).parse_next(payload)? - } - _ => None, - } - .unwrap_or(EF::Unknown { tag: rec.tag }); - - Ok(variant) - } - } -} - -/// 4.5.3 -Zip64 Extended Information Extra Field (0x0001) -#[derive(Clone, Default)] -pub struct ExtraZip64Field { - pub uncompressed_size: Option, - pub compressed_size: Option, - pub header_offset: Option, -} - -impl ExtraZip64Field { - const TAG: u16 = 0x0001; - - pub(crate) fn mk_parser( - settings: ExtraFieldSettings, - ) -> impl FnMut(&mut Partial<&'_ [u8]>) -> PResult { - move |i| { - // N.B: we ignore "disk start number" - seq! {Self { - uncompressed_size: cond(settings.needs_uncompressed_size, le_u64), - compressed_size: cond(settings.needs_compressed_size, le_u64), - header_offset: cond(settings.needs_header_offset, le_u64), - }} - .parse_next(i) - } - } -} - -/// Extended timestamp extra field -#[derive(Clone)] -pub struct ExtraTimestampField { - /// number of seconds since epoch - pub mtime: u32, -} - -impl ExtraTimestampField { - const TAG: u16 = 0x5455; - - fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { - preceded( - // 1 byte of flags, if bit 0 is set, modification time is present - le_u8.verify(|x| x & 0b1 != 0), - seq! {Self { mtime: le_u32 }}, - ) - .parse_next(i) - } -} - -/// 4.5.7 -UNIX Extra Field (0x000d): -#[derive(Clone)] -pub struct ExtraUnixField { - /// file last access time - pub atime: u32, - /// file last modification time - pub mtime: u32, - /// file user id - pub uid: u16, - /// file group id - pub gid: u16, - /// variable length data field - pub data: ZipBytes, -} - -impl ExtraUnixField { - const TAG: u16 = 0x000d; - const TAG_INFOZIP: u16 = 0x5855; - - fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { - let t_size = le_u16.parse_next(i)? - 12; - seq! {Self { - atime: le_u32, - mtime: le_u32, - uid: le_u16, - gid: le_u16, - data: ZipBytes::parser(t_size), - }} - .parse_next(i) - } -} - -/// Info-ZIP New Unix Extra Field: -/// ==================================== -/// -/// Currently stores Unix UIDs/GIDs up to 32 bits. 
-/// (Last Revision 20080509) -/// -/// ```text -/// Value Size Description -/// ----- ---- ----------- -/// 0x7875 Short tag for this extra block type ("ux") -/// TSize Short total data size for this block -/// Version 1 byte version of this extra field, currently 1 -/// UIDSize 1 byte Size of UID field -/// UID Variable UID for this entry -/// GIDSize 1 byte Size of GID field -/// GID Variable GID for this entry -/// ``` -#[derive(Clone)] -pub struct ExtraNewUnixField { - pub uid: u64, - pub gid: u64, -} - -impl ExtraNewUnixField { - const TAG: u16 = 0x7875; - - fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { - let _ = tag("\x01").parse_next(i)?; - seq! {Self { - uid: Self::parse_variable_length_integer, - gid: Self::parse_variable_length_integer, - }} - .parse_next(i) - } - - fn parse_variable_length_integer(i: &mut Partial<&'_ [u8]>) -> PResult { - let slice = length_take(le_u8).parse_next(i)?; - if let Some(u) = match slice.len() { - 1 => Some(le_u8.parse_peek(slice)?.1 as u64), - 2 => Some(le_u16.parse_peek(slice)?.1 as u64), - 4 => Some(le_u32.parse_peek(slice)?.1 as u64), - 8 => Some(le_u64.parse_peek(slice)?.1), - _ => None, - } { - Ok(u) - } else { - Err(ErrMode::from_error_kind(i, ErrorKind::Alt)) - } - } -} - -/// 4.5.5 -NTFS Extra Field (0x000a): -#[derive(Clone)] -pub struct ExtraNtfsField { - pub attrs: Vec, -} - -impl ExtraNtfsField { - const TAG: u16 = 0x000a; - - fn parse(i: &mut Partial<&'_ [u8]>) -> PResult { - let _ = take(4_usize).parse_next(i)?; // reserved (unused) - seq! {Self { - // from the winnow docs: - // Parsers like repeat do not know when an eof is from insufficient - // data or the end of the stream, causing them to always report - // Incomplete. - // using repeat_till with eof combinator to work around this: - attrs: repeat_till(0.., NtfsAttr::parse, winnow::combinator::eof).map(|x| x.0), - }} - .parse_next(i) - } -} - -/// NTFS attribute for zip entries (mostly timestamps) -#[derive(Clone)] -pub enum NtfsAttr { - Attr1(NtfsAttr1), - Unknown { tag: u16 }, -} - -impl NtfsAttr { - fn parse(i: &mut Partial<&'_ [u8]>) -> PResult { - let tag = le_u16.parse_next(i)?; - trace!("parsing NTFS attribute, tag {:04x}", tag); - let payload = length_take(le_u16).parse_next(i)?; - - match tag { - 0x0001 => NtfsAttr1::parser - .parse_peek(Partial::new(payload)) - .map(|(_, attr)| NtfsAttr::Attr1(attr)), - _ => Ok(NtfsAttr::Unknown { tag }), - } - } -} - -#[derive(Clone)] -pub struct NtfsAttr1 { - pub mtime: NtfsTimestamp, - pub atime: NtfsTimestamp, - pub ctime: NtfsTimestamp, -} - -impl NtfsAttr1 { - fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { - trace!("parsing NTFS attr 1, input len is {}", i.len()); - seq! 
{Self { - mtime: NtfsTimestamp::parser, - atime: NtfsTimestamp::parser, - ctime: NtfsTimestamp::parser, - }} - .parse_next(i) - } -} diff --git a/rc-zip/src/format/local.rs b/rc-zip/src/format/local.rs deleted file mode 100644 index 2c43c43..0000000 --- a/rc-zip/src/format/local.rs +++ /dev/null @@ -1,188 +0,0 @@ -use crate::{format::*, Error, UnsupportedError}; -use winnow::{ - binary::{le_u16, le_u32, le_u64, le_u8}, - combinator::opt, - error::{ContextError, ErrMode, ErrorKind, FromExternalError}, - seq, - token::tag, - PResult, Parser, Partial, -}; - -#[derive(Debug)] -/// 4.3.7 Local file header -pub struct LocalFileHeaderRecord { - /// version needed to extract - pub reader_version: Version, - /// general purpose bit flag - pub flags: u16, - /// compression method - pub method: Method, - /// last mod file datetime - pub modified: MsdosTimestamp, - /// crc-32 - pub crc32: u32, - /// compressed size - pub compressed_size: u32, - /// uncompressed size - pub uncompressed_size: u32, - // file name - pub name: ZipString, - // extra field - pub extra: ZipBytes, - - // method-specific fields - pub method_specific: MethodSpecific, -} - -#[derive(Debug)] -/// Method-specific properties following the local file header -pub enum MethodSpecific { - None, - Lzma(LzmaProperties), -} - -impl LocalFileHeaderRecord { - pub const SIGNATURE: &'static str = "PK\x03\x04"; - - pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { - let _ = tag(Self::SIGNATURE).parse_next(i)?; - - let reader_version = Version::parser.parse_next(i)?; - let flags = le_u16.parse_next(i)?; - let method = le_u16.parse_next(i).map(Method::from)?; - let modified = MsdosTimestamp::parser.parse_next(i)?; - let crc32 = le_u32.parse_next(i)?; - let compressed_size = le_u32.parse_next(i)?; - let uncompressed_size = le_u32.parse_next(i)?; - - let name_len = le_u16.parse_next(i)?; - let extra_len = le_u16.parse_next(i)?; - - let name = ZipString::parser(name_len).parse_next(i)?; - let extra = ZipBytes::parser(extra_len).parse_next(i)?; - - let method_specific = match method { - Method::Lzma => { - let lzma_properties = LzmaProperties::parser.parse_next(i)?; - if let Err(e) = lzma_properties.error_if_unsupported() { - return Err(ErrMode::Cut(ContextError::from_external_error( - i, - ErrorKind::Verify, - e, - ))); - } - MethodSpecific::Lzma(lzma_properties) - } - _ => MethodSpecific::None, - }; - - Ok(Self { - reader_version, - flags, - method, - modified, - crc32, - compressed_size, - uncompressed_size, - name, - extra, - method_specific, - }) - } - - pub fn has_data_descriptor(&self) -> bool { - // 4.3.9.1 This descriptor MUST exist if bit 3 of the general - // purpose bit flag is set (see below). - self.flags & 0b1000 != 0 - } -} - -/// 4.3.9 Data descriptor: -#[derive(Debug)] -pub struct DataDescriptorRecord { - /// CRC32 checksum - pub crc32: u32, - /// Compressed size - pub compressed_size: u64, - /// Uncompressed size - pub uncompressed_size: u64, -} - -impl DataDescriptorRecord { - const SIGNATURE: &'static str = "PK\x07\x08"; - - pub fn mk_parser(is_zip64: bool) -> impl FnMut(&mut Partial<&'_ [u8]>) -> PResult { - move |i| { - // From appnote.txt: - // - // 4.3.9.3 Although not originally assigned a signature, the value - // 0x08074b50 has commonly been adopted as a signature value for the - // data descriptor record. Implementers SHOULD be aware that ZIP files - // MAY be encountered with or without this signature marking data - // descriptors and SHOULD account for either case when reading ZIP files - // to ensure compatibility. 
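            // In other words, both of these layouts must decode to the same
            // record:
            //   [PK\x07\x08][crc32][compressed size][uncompressed size]
            //   [crc32][compressed size][uncompressed size]
            // which is why the signature below is wrapped in `opt(...)`.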
- let _ = opt(tag(Self::SIGNATURE)).parse_next(i)?; - - if is_zip64 { - seq! {Self { - crc32: le_u32, - compressed_size: le_u64, - uncompressed_size: le_u64, - }} - .parse_next(i) - } else { - seq! {Self { - crc32: le_u32, - compressed_size: le_u32.map(|x| x as u64), - uncompressed_size: le_u32.map(|x| x as u64), - }} - .parse_next(i) - } - } - } -} - -/// 5.8.5 LZMA Properties header -#[derive(Debug)] -pub struct LzmaProperties { - /// major version - pub major: u8, - /// minor version - pub minor: u8, - /// properties size - pub properties_size: u16, -} - -impl LzmaProperties { - pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { - seq! {Self { - major: le_u8, - minor: le_u8, - properties_size: le_u16, - }} - .parse_next(i) - } - - pub fn error_if_unsupported(&self) -> Result<(), Error> { - if (self.major, self.minor) != (2, 0) { - return Err(Error::Unsupported( - UnsupportedError::LzmaVersionUnsupported { - minor: self.minor, - major: self.major, - }, - )); - } - - const LZMA_PROPERTIES_SIZE: u16 = 5; - if self.properties_size != LZMA_PROPERTIES_SIZE { - return Err(Error::Unsupported( - UnsupportedError::LzmaPropertiesHeaderWrongSize { - expected: 5, - actual: self.properties_size, - }, - )); - } - - Ok(()) - } -} diff --git a/rc-zip/src/format/mod.rs b/rc-zip/src/format/mod.rs deleted file mode 100644 index 541edc8..0000000 --- a/rc-zip/src/format/mod.rs +++ /dev/null @@ -1,23 +0,0 @@ -//! Contain winnow parsers for most elements that make up a ZIP file, like -//! the end-of-central-directory record, local file headers, and central -//! directory headers. -//! -//! Everything in there is based off of the appnote, which you can find in the -//! source repository. - -pub use crate::encoding::Encoding; - -mod archive; -mod extra_field; -mod mode; -mod version; -pub use self::{archive::*, extra_field::*, mode::*, version::*}; - -mod date_time; -mod directory_header; -mod eocd; -mod local; -mod raw; -pub use self::{date_time::*, directory_header::*, eocd::*, local::*, raw::*}; - -use chrono::{offset::Utc, DateTime}; diff --git a/rc-zip/src/format/mode.rs b/rc-zip/src/format/mode.rs deleted file mode 100644 index 1baff51..0000000 --- a/rc-zip/src/format/mode.rs +++ /dev/null @@ -1,239 +0,0 @@ -use std::fmt; - -/// Mode represents a file's mode and permission bits. -/// The bits have the same definition on all systems, -/// but not all bits apply to all systems. -/// -/// It is modelled after Go's `os.FileMode`. 
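Before the `Mode` definition below, a minimal sketch of the Go-style layout it uses — flag bits at the high end, the traditional nine `rwxrwxrwx` permission bits at the low end. The constants here are re-declared locally for the sketch:

```rust
fn main() {
    const DIR: u32 = 1 << 31; // same bit Mode::DIR uses below

    // A directory with rwxr-xr-x permissions: one flag bit + nine permission bits.
    let mode = DIR | 0o755;
    assert!(mode & DIR != 0);
    assert_eq!(mode & 0o777, 0o755);

    // A permission check mirrors Mode::has(): a non-empty bit intersection.
    let group_can_write = mode & 0o020 != 0;
    assert!(!group_can_write);
}
```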
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub struct Mode(pub u32); - -impl Mode { - /// d: is a directory - pub const DIR: Self = Self(1 << 31); - /// a: append-only - pub const APPEND: Self = Self(1 << 30); - /// l: exclusive use - pub const EXCLUSIVE: Self = Self(1 << 29); - /// T: temporary file; Plan 9 only - pub const TEMPORARY: Self = Self(1 << 28); - /// L: symbolic link - pub const SYMLINK: Self = Self(1 << 27); - /// D: device file - pub const DEVICE: Self = Self(1 << 26); - /// p: named pipe (FIFO) - pub const NAMED_PIPE: Self = Self(1 << 25); - /// S: Unix domain socket - pub const SOCKET: Self = Self(1 << 24); - /// u: setuid - pub const SETUID: Self = Self(1 << 23); - /// g: setgid - pub const SETGID: Self = Self(1 << 22); - /// c: Unix character device, when DEVICE is set - pub const CHAR_DEVICE: Self = Self(1 << 21); - /// t: sticky - pub const STICKY: Self = Self(1 << 20); - /// ?: non-regular file; nothing else is known - pub const IRREGULAR: Self = Self(1 << 19); -} - -impl fmt::Display for Mode { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let mut w = 0; - if self.has(Self::DIR) { - write!(f, "d")?; - w += 1; - } - if self.has(Self::APPEND) { - write!(f, "a")?; - w += 1; - } - if self.has(Self::EXCLUSIVE) { - write!(f, "l")?; - w += 1; - } - if self.has(Self::TEMPORARY) { - write!(f, "T")?; - w += 1; - } - if self.has(Self::SYMLINK) { - write!(f, "L")?; - w += 1; - } - if self.has(Self::DEVICE) { - write!(f, "D")?; - w += 1; - } - if self.has(Self::NAMED_PIPE) { - write!(f, "p")?; - w += 1; - } - if self.has(Self::SOCKET) { - write!(f, "S")?; - w += 1; - } - if self.has(Self::SETUID) { - write!(f, "u")?; - w += 1; - } - if self.has(Self::SETGID) { - write!(f, "g")?; - w += 1; - } - if self.has(Self::CHAR_DEVICE) { - write!(f, "c")?; - w += 1; - } - if self.has(Self::STICKY) { - write!(f, "t")?; - w += 1; - } - if self.has(Self::IRREGULAR) { - write!(f, "?")?; - w += 1; - } - if w == 0 { - write!(f, "-")?; - } - - let rwx = "rwxrwxrwx"; - for (i, c) in rwx.char_indices() { - if self.has(Mode(1 << (9 - 1 - i))) { - write!(f, "{}", c)?; - } else { - write!(f, "-")?; - } - } - - Ok(()) - } -} - -impl From for Mode { - fn from(m: UnixMode) -> Self { - let mut mode = Mode(m.0 & 0o777); - - match m & UnixMode::IFMT { - UnixMode::IFBLK => mode |= Mode::DEVICE, - UnixMode::IFCHR => mode |= Mode::DEVICE & Mode::CHAR_DEVICE, - UnixMode::IFDIR => mode |= Mode::DIR, - UnixMode::IFIFO => mode |= Mode::NAMED_PIPE, - UnixMode::IFLNK => mode |= Mode::SYMLINK, - UnixMode::IFREG => { /* nothing to do */ } - UnixMode::IFSOCK => mode |= Mode::SOCKET, - _ => {} - } - - if m.has(UnixMode::ISGID) { - mode |= Mode::SETGID - } - if m.has(UnixMode::ISUID) { - mode |= Mode::SETUID - } - if m.has(UnixMode::ISVTX) { - mode |= Mode::STICKY - } - - mode - } -} - -impl From for Mode { - fn from(m: MsdosMode) -> Self { - let mut mode = if m.has(MsdosMode::DIR) { - Mode::DIR | Mode(0o777) - } else { - Mode(0o666) - }; - if m.has(MsdosMode::READ_ONLY) { - mode &= Mode(0o222); - } - - mode - } -} - -impl From for Mode { - fn from(u: u32) -> Self { - Mode(u) - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub struct UnixMode(pub u32); - -impl UnixMode { - pub const IFMT: Self = Self(0xf000); - pub const IFSOCK: Self = Self(0xc000); - pub const IFLNK: Self = Self(0xa000); - pub const IFREG: Self = Self(0x8000); - pub const IFBLK: Self = Self(0x6000); - pub const IFDIR: Self = Self(0x4000); - pub const IFCHR: Self = Self(0x2000); - pub const IFIFO: Self = Self(0x1000); 
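    // setuid / setgid / sticky — the remaining non-permission bits of st_mode: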
- pub const ISUID: Self = Self(0x800); - pub const ISGID: Self = Self(0x400); - pub const ISVTX: Self = Self(0x200); -} - -impl From for UnixMode { - fn from(u: u32) -> Self { - UnixMode(u) - } -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] -pub struct MsdosMode(pub u32); - -impl MsdosMode { - pub const DIR: Self = Self(0x10); - pub const READ_ONLY: Self = Self(0x01); -} - -impl From for MsdosMode { - fn from(u: u32) -> Self { - MsdosMode(u) - } -} - -macro_rules! derive_bitops { - ($T: ty) => { - impl std::ops::BitOr for $T { - type Output = Self; - - fn bitor(self, rhs: Self) -> Self { - Self(self.0 | rhs.0) - } - } - - impl std::ops::BitOrAssign for $T { - fn bitor_assign(&mut self, rhs: Self) { - self.0 |= rhs.0; - } - } - - impl std::ops::BitAnd for $T { - type Output = Self; - - fn bitand(self, rhs: Self) -> Self { - Self(self.0 & rhs.0) - } - } - - impl std::ops::BitAndAssign for $T { - fn bitand_assign(&mut self, rhs: Self) { - self.0 &= rhs.0; - } - } - - impl $T { - pub fn has(&self, rhs: Self) -> bool { - self.0 & rhs.0 != 0 - } - } - }; -} - -derive_bitops!(Mode); -derive_bitops!(UnixMode); -derive_bitops!(MsdosMode); diff --git a/rc-zip/src/format/raw.rs b/rc-zip/src/format/raw.rs deleted file mode 100644 index fb978ab..0000000 --- a/rc-zip/src/format/raw.rs +++ /dev/null @@ -1,77 +0,0 @@ -use pretty_hex::PrettyHex; -use std::fmt; -use winnow::{stream::ToUsize, token::take, PResult, Parser, Partial}; - -/// A raw zip string, with no specific encoding. -/// -/// This is used while parsing a zip archive's central directory, -/// before we know what encoding is used. -#[derive(Clone)] -pub struct ZipString(pub Vec); - -impl<'a> From<&'a [u8]> for ZipString { - fn from(slice: &'a [u8]) -> Self { - Self(slice.into()) - } -} - -impl fmt::Debug for ZipString { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match std::str::from_utf8(&self.0) { - Ok(s) => write!(f, "{:?}", s), - Err(_) => write!(f, "[non-utf8 string: {}]", self.0.hex_dump()), - } - } -} - -impl ZipString { - pub(crate) fn parser(count: C) -> impl FnMut(&mut Partial<&'_ [u8]>) -> PResult - where - C: ToUsize, - { - let count = count.to_usize(); - move |i| (take(count).map(|slice: &[u8]| Self(slice.into()))).parse_next(i) - } - - pub(crate) fn into_option(self) -> Option { - if !self.0.is_empty() { - Some(self) - } else { - None - } - } -} - -/// A raw u8 slice, with no specific structure. -/// -/// This is used while parsing a zip archive, when we want -/// to retain an owned slice to be parsed later. 
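`ZipString` defers decoding: it keeps raw bytes until the archive's text encoding is known. A small sketch of the distinction its `Debug` impl draws, using hand-built bytes:

```rust
fn main() {
    let utf8 = b"hello.txt".to_vec();
    let other = vec![0x68, 0xE9, 0x2E, 0x74, 0x78, 0x74]; // "hé.txt" in Latin-1

    // Valid UTF-8 bytes can be shown as text right away...
    assert!(std::str::from_utf8(&utf8).is_ok());
    // ...anything else has to wait for encoding detection
    // (a lone 0xE9 is not valid UTF-8).
    assert!(std::str::from_utf8(&other).is_err());
}
```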
-#[derive(Clone)] -pub struct ZipBytes(pub Vec); - -impl fmt::Debug for ZipBytes { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - const MAX_SHOWN_SIZE: usize = 10; - let data = &self.0[..]; - let (slice, extra) = if data.len() > MAX_SHOWN_SIZE { - (&self.0[..MAX_SHOWN_SIZE], Some(data.len() - MAX_SHOWN_SIZE)) - } else { - (&self.0[..], None) - }; - write!(f, "{}", slice.hex_dump())?; - if let Some(extra) = extra { - write!(f, " (+ {} bytes)", extra)?; - } - Ok(()) - } -} - -impl ZipBytes { - pub(crate) fn parser(count: C) -> impl FnMut(&mut Partial<&'_ [u8]>) -> PResult - where - C: ToUsize, - { - let count = count.to_usize(); - move |i| (take(count).map(|slice: &[u8]| Self(slice.into()))).parse_next(i) - } -} diff --git a/rc-zip/src/format/version.rs b/rc-zip/src/format/version.rs deleted file mode 100644 index 1b9ac8f..0000000 --- a/rc-zip/src/format/version.rs +++ /dev/null @@ -1,133 +0,0 @@ -use std::fmt; -use winnow::{binary::le_u16, PResult, Parser, Partial}; - -/// A zip version (either created by, or required when reading an archive). -/// -/// Versions determine which features are supported by a tool, and -/// which features are required when reading a file. -/// -/// For more information, see the [.ZIP Application Note](https://support.pkware.com/display/PKZIP/APPNOTE), section 4.4.2. -#[derive(Clone, Copy, PartialEq, Eq, Hash)] -pub struct Version(pub u16); - -impl fmt::Debug for Version { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!( - f, - "{:?} v{}.{}", - self.host_system(), - self.major(), - self.minor() - ) - } -} - -impl Version { - /// Parse a version from a byte slice - pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult { - le_u16.map(Self).parse_next(i) - } - - /// Identifies the host system on which the zip attributes are compatible. - pub fn host_system(&self) -> HostSystem { - match self.host() { - 0 => HostSystem::MsDos, - 1 => HostSystem::Amiga, - 2 => HostSystem::OpenVms, - 3 => HostSystem::Unix, - 4 => HostSystem::VmCms, - 5 => HostSystem::AtariSt, - 6 => HostSystem::Os2Hpfs, - 7 => HostSystem::Macintosh, - 8 => HostSystem::ZSystem, - 9 => HostSystem::CpM, - 10 => HostSystem::WindowsNtfs, - 11 => HostSystem::Mvs, - 12 => HostSystem::Vse, - 13 => HostSystem::AcornRisc, - 14 => HostSystem::Vfat, - 15 => HostSystem::AlternateMvs, - 16 => HostSystem::BeOs, - 17 => HostSystem::Tandem, - 18 => HostSystem::Os400, - 19 => HostSystem::Osx, - n => HostSystem::Unknown(n), - } - } - - /// Integer host system - pub fn host(&self) -> u8 { - (self.0 >> 8) as u8 - } - - /// Integer version, e.g. 45 for Zip version 4.5 - pub fn version(&self) -> u8 { - (self.0 & 0xff) as u8 - } - - /// ZIP specification major version - /// - /// See APPNOTE, section 4.4.2.1 - pub fn major(&self) -> u32 { - self.version() as u32 / 10 - } - - /// ZIP specification minor version - /// - /// See APPNOTE, section 4.4.2.1 - pub fn minor(&self) -> u32 { - self.version() as u32 % 10 - } -} - -/// System on which an archive was created, as encoded into a version u16. 
-/// -/// See APPNOTE, section 4.4.2.2 -#[derive(Debug)] -pub enum HostSystem { - /// MS-DOS and OS/2 (FAT / VFAT / FAT32 file systems) - MsDos, - /// Amiga - Amiga, - /// OpenVMS - OpenVms, - /// UNIX - Unix, - /// VM/CMS - VmCms, - /// Atari ST - AtariSt, - /// OS/2 H.P.F.S - Os2Hpfs, - /// Macintosh (see `Osx`) - Macintosh, - /// Z-System - ZSystem, - /// CP/M - CpM, - /// Windows NTFS - WindowsNtfs, - /// MVS (OS/390 - Z/OS) - Mvs, - /// VSE - Vse, - /// Acorn Risc - AcornRisc, - /// VFAT - Vfat, - /// alternate MVS - AlternateMvs, - /// BeOS - BeOs, - /// Tandem - Tandem, - /// OS/400 - Os400, - /// OS X (Darwin) - Osx, - /// Unknown host system - /// - /// Values 20 through 255 are currently unused, as of - /// APPNOTE.TXT 6.3.6 (April 26, 2019) - Unknown(u8), -} diff --git a/rc-zip/src/fsm/archive.rs b/rc-zip/src/fsm/archive.rs index 6641d59..68c8b1f 100644 --- a/rc-zip/src/fsm/archive.rs +++ b/rc-zip/src/fsm/archive.rs @@ -3,11 +3,12 @@ use crate::{ encoding::Encoding, error::{Error, FormatError}, parse::{ - Archive, DirectoryHeader, EndOfCentralDirectory, EndOfCentralDirectory64Locator, - EndOfCentralDirectory64Record, EndOfCentralDirectoryRecord, Located, StoredEntry, + Archive, CentralDirectoryFileHeader, EndOfCentralDirectory, EndOfCentralDirectory64Locator, + EndOfCentralDirectory64Record, EndOfCentralDirectoryRecord, Entry, Located, }, }; +use ownable::traits::IntoOwned; use tracing::trace; use winnow::{ error::ErrMode, @@ -54,19 +55,19 @@ enum State { /// Reading the zip64 end of central directory record. ReadEocd64Locator { - eocdr: Located, + eocdr: Located>, }, /// Reading the zip64 end of central directory record. ReadEocd64 { eocdr64_offset: u64, - eocdr: Located, + eocdr: Located>, }, /// Reading all headers from the central directory ReadCentralDirectory { - eocd: EndOfCentralDirectory, - directory_headers: Vec, + eocd: EndOfCentralDirectory<'static>, + directory_headers: Vec>, }, #[default] @@ -140,12 +141,13 @@ impl ArchiveFsm { EndOfCentralDirectoryRecord::find_in_block(haystack) } { None => Err(FormatError::DirectoryEndSignatureNotFound.into()), - Some(mut eocdr) => { + Some(eocdr) => { trace!( ?eocdr, size = self.size, "ReadEocd | found end of central directory record" ); + let mut eocdr = eocdr.into_owned(); self.buffer.reset(); eocdr.offset += self.size - haystack_size; @@ -249,6 +251,7 @@ impl ArchiveFsm { "ReadCentralDirectory | process(), available: {}", self.buffer.available_data() ); + let mut valid_consumed = 0; let mut input = Partial::new(self.buffer.data()); trace!( initial_offset = input.as_bytes().offset_from(&self.buffer.data()), @@ -256,7 +259,7 @@ impl ArchiveFsm { "initial offset & len" ); 'read_headers: while !input.is_empty() { - match DirectoryHeader::parser.parse_next(&mut input) { + match CentralDirectoryFileHeader::parser.parse_next(&mut input) { Ok(dh) => { trace!( input_empty_now = input.is_empty(), @@ -264,14 +267,15 @@ impl ArchiveFsm { len = input.len(), "ReadCentralDirectory | parsed directory header" ); - directory_headers.push(dh); + valid_consumed = input.as_bytes().offset_from(&self.buffer.data()); + directory_headers.push(dh.into_owned()); } Err(ErrMode::Incomplete(_needed)) => { // need more data to read the full header trace!("ReadCentralDirectory | incomplete!"); break 'read_headers; } - Err(ErrMode::Backtrack(_err)) | Err(ErrMode::Cut(_err)) => { + Err(ErrMode::Backtrack(err)) | Err(ErrMode::Cut(err)) => { // this is the normal end condition when reading // the central directory (due to 65536-entries non-zip64 files) // let's 
just check a few numbers first. @@ -280,92 +284,95 @@ impl ArchiveFsm { let expected_records = directory_headers.len() as u16; let actual_records = eocd.directory_records() as u16; - if expected_records == actual_records { - let mut detectorng = chardetng::EncodingDetector::new(); - let mut all_utf8 = true; - let mut had_suspicious_chars_for_cp437 = false; + if expected_records != actual_records { + tracing::trace!( + "error while reading central records: we read {} records, but EOCD announced {}. the last failed with: {err:?} (display: {err}). at that point, input had length {}", + expected_records, + actual_records, + input.len() + ); - { - let max_feed: usize = 4096; - let mut total_fed: usize = 0; - let mut feed = |slice: &[u8]| { - detectorng.feed(slice, false); - for b in slice { - if (0xB0..=0xDF).contains(b) { - // those are, like, box drawing characters - had_suspicious_chars_for_cp437 = true; - } + // if we read the wrong number of directory entries, + // error out. + return Err(FormatError::InvalidCentralRecord { + expected: expected_records, + actual: actual_records, + } + .into()); + } + + let mut detectorng = chardetng::EncodingDetector::new(); + let mut all_utf8 = true; + let mut had_suspicious_chars_for_cp437 = false; + + { + let max_feed: usize = 4096; + let mut total_fed: usize = 0; + let mut feed = |slice: &[u8]| { + detectorng.feed(slice, false); + for b in slice { + if (0xB0..=0xDF).contains(b) { + // those are, like, box drawing characters + had_suspicious_chars_for_cp437 = true; } + } - total_fed += slice.len(); - total_fed < max_feed - }; + total_fed += slice.len(); + total_fed < max_feed + }; - 'recognize_encoding: for fh in - directory_headers.iter().filter(|fh| fh.is_non_utf8()) - { - all_utf8 = false; - if !feed(&fh.name.0) || !feed(&fh.comment.0) { - break 'recognize_encoding; - } + 'recognize_encoding: for fh in + directory_headers.iter().filter(|fh| fh.is_non_utf8()) + { + all_utf8 = false; + if !feed(&fh.name[..]) || !feed(&fh.comment[..]) { + break 'recognize_encoding; } } + } - let encoding = { - if all_utf8 { - Encoding::Utf8 - } else { - let encoding = detectorng.guess(None, true); - if encoding == encoding_rs::SHIFT_JIS { - // well hold on, sometimes Codepage 437 is detected as - // Shift-JIS by chardetng. If we have any characters - // that aren't valid DOS file names, then okay it's probably - // Shift-JIS. Otherwise, assume it's CP437. - if had_suspicious_chars_for_cp437 { - Encoding::ShiftJis - } else { - Encoding::Cp437 - } - } else if encoding == encoding_rs::UTF_8 { - Encoding::Utf8 + let encoding = { + if all_utf8 { + Encoding::Utf8 + } else { + let encoding = detectorng.guess(None, true); + if encoding == encoding_rs::SHIFT_JIS { + // well hold on, sometimes Codepage 437 is detected as + // Shift-JIS by chardetng. If we have any characters + // that aren't valid DOS file names, then okay it's probably + // Shift-JIS. Otherwise, assume it's CP437. 
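                        // (Background: 0xB0-0xDF are box-drawing glyphs in CP437
                        // but half-width katakana in Shift-JIS — common in real
                        // Japanese file names, vanishingly rare in DOS-era ones —
                        // hence this heuristic.)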
+ if had_suspicious_chars_for_cp437 { + Encoding::ShiftJis } else { Encoding::Cp437 } + } else if encoding == encoding_rs::UTF_8 { + Encoding::Utf8 + } else { + Encoding::Cp437 } - }; - - let is_zip64 = eocd.dir64.is_some(); - let global_offset = eocd.global_offset as u64; - let entries: Result, Error> = directory_headers - .iter() - .map(|x| x.as_stored_entry(is_zip64, encoding, global_offset)) - .collect(); - let entries = entries?; - - let mut comment: Option = None; - if !eocd.comment().0.is_empty() { - comment = Some(encoding.decode(&eocd.comment().0)?); } - - return Ok(FsmResult::Done(Archive { - size: self.size, - comment, - entries, - encoding, - })); - } else { - // if we read the wrong number of directory entries, - // error out. - return Err(FormatError::InvalidCentralRecord { - expected: expected_records, - actual: actual_records, - } - .into()); - } + }; + + let global_offset = eocd.global_offset as u64; + let entries: Result, Error> = directory_headers + .iter() + .map(|x| x.as_entry(encoding, global_offset)) + .collect(); + let entries = entries?; + + let comment = encoding.decode(eocd.comment())?; + + return Ok(FsmResult::Done(Archive { + size: self.size, + comment, + entries, + encoding, + })); } } } - let consumed = input.as_bytes().offset_from(&self.buffer.data()); + let consumed = valid_consumed; tracing::trace!(%consumed, "ReadCentralDirectory total consumed"); self.buffer.consume(consumed); diff --git a/rc-zip/src/fsm/entry/lzma_dec.rs b/rc-zip/src/fsm/entry/lzma_dec.rs index 0b98890..346e041 100644 --- a/rc-zip/src/fsm/entry/lzma_dec.rs +++ b/rc-zip/src/fsm/entry/lzma_dec.rs @@ -21,10 +21,10 @@ pub(crate) struct LzmaDec { } impl LzmaDec { - pub fn new(uncompressed_size: u64) -> Self { + pub fn new(uncompressed_size: Option) -> Self { let stream = Stream::new_with_options( &(Options { - unpacked_size: UnpackedSize::UseProvided(Some(uncompressed_size)), + unpacked_size: UnpackedSize::UseProvided(uncompressed_size), allow_incomplete: false, memlimit: Some(128 * 1024 * 1024), }), diff --git a/rc-zip/src/fsm/entry/mod.rs b/rc-zip/src/fsm/entry/mod.rs index db72965..b46f42f 100644 --- a/rc-zip/src/fsm/entry/mod.rs +++ b/rc-zip/src/fsm/entry/mod.rs @@ -27,7 +27,7 @@ mod zstd_dec; use crate::{ error::{Error, FormatError, UnsupportedError}, - parse::{DataDescriptorRecord, LocalFileHeaderRecord, Method, StoredEntryInner}, + parse::{DataDescriptorRecord, Entry, LocalFileHeader, Method}, }; use super::FsmResult; @@ -42,8 +42,11 @@ enum State { ReadLocalHeader, ReadData { - /// The local file header for this entry - header: LocalFileHeaderRecord, + /// Whether the entry has a data descriptor + has_data_descriptor: bool, + + /// Whether the entry is zip64 (because its compressed size or uncompressed size is u32::MAX) + is_zip64: bool, /// Amount of bytes we've fed to the decompressor compressed_bytes: u64, @@ -59,17 +62,14 @@ enum State { }, ReadDataDescriptor { - /// The local file header for this entry - header: LocalFileHeaderRecord, + /// Whether the entry is zip64 (because its compressed size or uncompressed size is u32::MAX) + is_zip64: bool, /// Size we've decompressed + crc32 hash we've computed metrics: EntryReadMetrics, }, Validate { - /// The local file header for this entry - header: LocalFileHeaderRecord, - /// Size we've decompressed + crc32 hash we've computed metrics: EntryReadMetrics, @@ -84,21 +84,25 @@ enum State { /// A state machine that can parse a zip entry pub struct EntryFsm { state: State, - entry: StoredEntryInner, - method: Method, + entry: Option, 
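    // (`None` until the entry's metadata is known: either supplied up front
    // from the central directory, or filled in once the local file header
    // has been parsed — see `process_till_header` below.)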
buffer: Buffer, - eof: bool, } impl EntryFsm { /// Create a new state machine for decompressing a zip entry - pub fn new(method: Method, entry: StoredEntryInner) -> Self { + pub fn new(entry: Option, buffer: Option) -> Self { + const BUF_CAPACITY: usize = 256 * 1024; + Self { state: State::ReadLocalHeader, entry, - method, - buffer: Buffer::with_capacity(256 * 1024), - eof: false, + buffer: match buffer { + Some(buffer) => { + assert!(buffer.capacity() >= BUF_CAPACITY, "buffer too small"); + buffer + } + None => Buffer::with_capacity(BUF_CAPACITY), + }, } } @@ -118,6 +122,60 @@ impl EntryFsm { } } + /// Like `process`, but only processes the header. If this returns + /// `Ok(None)`, the caller should read more data and call this function + /// again. + pub fn process_till_header(&mut self) -> Result, Error> { + match &self.state { + State::ReadLocalHeader => { + self.internal_process_local_header()?; + } + _ => { + // already good + } + } + + // this will be non-nil if we've parsed the local header, otherwise, + Ok(self.entry.as_ref()) + } + + fn internal_process_local_header(&mut self) -> Result { + assert!( + matches!(self.state, State::ReadLocalHeader), + "internal_process_local_header called in wrong state", + ); + + let mut input = Partial::new(self.buffer.data()); + match LocalFileHeader::parser.parse_next(&mut input) { + Ok(header) => { + let consumed = input.as_bytes().offset_from(&self.buffer.data()); + tracing::trace!(local_file_header = ?header, consumed, "parsed local file header"); + let decompressor = AnyDecompressor::new( + header.method, + self.entry.as_ref().map(|entry| entry.uncompressed_size), + )?; + + if self.entry.is_none() { + self.entry = Some(header.as_entry()?); + } + + self.state = State::ReadData { + is_zip64: header.compressed_size == u32::MAX + || header.uncompressed_size == u32::MAX, + has_data_descriptor: header.has_data_descriptor(), + compressed_bytes: 0, + uncompressed_bytes: 0, + hasher: crc32fast::Hasher::new(), + decompressor, + }; + self.buffer.consume(consumed); + Ok(true) + } + Err(ErrMode::Incomplete(_)) => Ok(false), + Err(_e) => Err(Error::Format(FormatError::InvalidLocalHeader)), + } + } + /// Process the input and write the output to the given buffer /// /// This function will return `FsmResult::Continue` if it needs more input @@ -131,7 +189,7 @@ impl EntryFsm { pub fn process( mut self, out: &mut [u8], - ) -> Result, Error> { + ) -> Result, Error> { tracing::trace!( state = match &self.state { State::ReadLocalHeader => "ReadLocalHeader", @@ -146,26 +204,8 @@ impl EntryFsm { use State as S; match &mut self.state { S::ReadLocalHeader => { - let mut input = Partial::new(self.buffer.data()); - match LocalFileHeaderRecord::parser.parse_next(&mut input) { - Ok(header) => { - let consumed = input.as_bytes().offset_from(&self.buffer.data()); - tracing::trace!(local_file_header = ?header, consumed, "parsed local file header"); - self.buffer.consume(consumed); - self.state = S::ReadData { - header, - compressed_bytes: 0, - uncompressed_bytes: 0, - hasher: crc32fast::Hasher::new(), - decompressor: AnyDecompressor::new(self.method, &self.entry)?, - }; - self.process(out) - } - Err(ErrMode::Incomplete(_)) => { - Ok(FsmResult::Continue((self, Default::default()))) - } - Err(_e) => Err(Error::Format(FormatError::InvalidLocalHeader)), - } + self.internal_process_local_header()?; + self.process(out) } S::ReadData { compressed_bytes, @@ -177,42 +217,56 @@ impl EntryFsm { let in_buf = self.buffer.data(); // don't feed the decompressor bytes beyond the entry's 
compressed size + + let entry = self.entry.as_ref().unwrap(); let in_buf_max_len = cmp::min( in_buf.len(), - self.entry.compressed_size as usize - *compressed_bytes as usize, + entry.compressed_size as usize - *compressed_bytes as usize, ); let in_buf = &in_buf[..in_buf_max_len]; let fed_bytes_after_this = *compressed_bytes + in_buf.len() as u64; - - let has_more_input = if fed_bytes_after_this == self.entry.compressed_size as _ { + let has_more_input = if fed_bytes_after_this == entry.compressed_size as _ { HasMoreInput::No } else { HasMoreInput::Yes }; + + trace!( + compressed_bytes = *compressed_bytes, + uncompressed_bytes = *uncompressed_bytes, + fed_bytes_after_this, + in_buf_len = in_buf.len(), + ?has_more_input, + "decompressing" + ); + let outcome = decompressor.decompress(in_buf, out, has_more_input)?; trace!( ?outcome, compressed_bytes = *compressed_bytes, uncompressed_bytes = *uncompressed_bytes, - eof = self.eof, "decompressed" ); self.buffer.consume(outcome.bytes_read); *compressed_bytes += outcome.bytes_read as u64; - if outcome.bytes_written == 0 && self.eof { + if outcome.bytes_written == 0 && *compressed_bytes == entry.compressed_size { + trace!("eof and no bytes written, we're done"); + // we're done, let's read the data descriptor (if there's one) - transition!(self.state => (S::ReadData { header, uncompressed_bytes, hasher, .. }) { + transition!(self.state => (S::ReadData { has_data_descriptor, is_zip64, uncompressed_bytes, hasher, .. }) { let metrics = EntryReadMetrics { uncompressed_size: uncompressed_bytes, crc32: hasher.finalize(), }; - if header.has_data_descriptor() { - S::ReadDataDescriptor { header, metrics } + if has_data_descriptor { + trace!("transitioning to ReadDataDescriptor"); + S::ReadDataDescriptor { metrics, is_zip64 } } else { - S::Validate { header, metrics, descriptor: None } + trace!("transitioning to Validate"); + S::Validate { metrics, descriptor: None } } }); return self.process(out); @@ -231,15 +285,16 @@ impl EntryFsm { Ok(FsmResult::Continue((self, outcome))) } - S::ReadDataDescriptor { .. } => { + S::ReadDataDescriptor { is_zip64, .. } => { let mut input = Partial::new(self.buffer.data()); - match DataDescriptorRecord::mk_parser(self.entry.is_zip64).parse_next(&mut input) { + + match DataDescriptorRecord::mk_parser(*is_zip64).parse_next(&mut input) { Ok(descriptor) => { self.buffer .consume(input.as_bytes().offset_from(&self.buffer.data())); trace!("data descriptor = {:#?}", descriptor); - transition!(self.state => (S::ReadDataDescriptor { metrics, header, .. }) { - S::Validate { metrics, header, descriptor: Some(descriptor) } + transition!(self.state => (S::ReadDataDescriptor { metrics, .. 
}) { + S::Validate { metrics, descriptor: Some(descriptor) } }); self.process(out) } @@ -250,29 +305,22 @@ impl EntryFsm { } } S::Validate { - header, metrics, descriptor, } => { - let expected_crc32 = if self.entry.crc32 != 0 { - self.entry.crc32 - } else if let Some(descriptor) = descriptor.as_ref() { - descriptor.crc32 - } else { - header.crc32 - }; + let entry = self.entry.as_ref().unwrap(); - let expected_size = if self.entry.uncompressed_size != 0 { - self.entry.uncompressed_size + let expected_crc32 = if entry.crc32 != 0 { + entry.crc32 } else if let Some(descriptor) = descriptor.as_ref() { - descriptor.uncompressed_size + descriptor.crc32 } else { - header.uncompressed_size as u64 + 0 }; - if expected_size != metrics.uncompressed_size { + if entry.uncompressed_size != metrics.uncompressed_size { return Err(Error::Format(FormatError::WrongSize { - expected: expected_size, + expected: entry.uncompressed_size, actual: metrics.uncompressed_size, })); } @@ -284,7 +332,7 @@ impl EntryFsm { })); } - Ok(FsmResult::Done(())) + Ok(FsmResult::Done(self.buffer)) } S::Transition => { unreachable!("the state machine should never be in the transition state") @@ -305,13 +353,8 @@ impl EntryFsm { /// After having written data to [Self::space], call this to indicate how /// many bytes were written. - /// - /// If this is called with zero, it indicates eof #[inline] pub fn fill(&mut self, count: usize) -> usize { - if count == 0 { - self.eof = true; - } self.buffer.fill(count) } } @@ -339,6 +382,8 @@ pub struct DecompressOutcome { pub bytes_written: usize, } +/// Whether there's more input to be fed to the decompressor +#[derive(Debug)] pub enum HasMoreInput { Yes, No, @@ -354,7 +399,7 @@ trait Decompressor { } impl AnyDecompressor { - fn new(method: Method, #[allow(unused)] entry: &StoredEntryInner) -> Result<Self, Error> { + fn new(method: Method, #[allow(unused)] uncompressed_size: Option<u64>) -> Result<Self, Error> { let dec = match method { Method::Store => Self::Store(Default::default()), @@ -383,7 +428,7 @@ impl AnyDecompressor { } #[cfg(feature = "lzma")] - Method::Lzma => Self::Lzma(Box::new(lzma_dec::LzmaDec::new(entry.uncompressed_size))), + Method::Lzma => Self::Lzma(Box::new(lzma_dec::LzmaDec::new(uncompressed_size))), #[cfg(not(feature = "lzma"))] Method::Lzma => { let err = Error::Unsupported(UnsupportedError::MethodNotEnabled(method)); diff --git a/rc-zip/src/lib.rs b/rc-zip/src/lib.rs index 50fda8c..adadb93 100644 --- a/rc-zip/src/lib.rs +++ b/rc-zip/src/lib.rs @@ -19,3 +19,6 @@ pub mod parse; #[cfg(any(test, feature = "corpus"))] pub mod corpus; + +// dependencies re-exports +pub use chrono; diff --git a/rc-zip/src/parse/archive.rs b/rc-zip/src/parse/archive.rs index 4b464eb..009e43e 100644 --- a/rc-zip/src/parse/archive.rs +++ b/rc-zip/src/parse/archive.rs @@ -1,13 +1,17 @@ -use chrono::{DateTime, Utc}; +use chrono::{offset::Utc, DateTime, TimeZone}; use num_enum::{FromPrimitive, IntoPrimitive}; +use ownable::{IntoOwned, ToOwned}; +use winnow::{binary::le_u16, PResult, Partial}; use crate::{ encoding::Encoding, - parse::{ExtraField, Mode, Version}, + parse::{Mode, Version}, }; +use super::{zero_datetime, ExtraField, NtfsAttr}; + /// An Archive contains general information about a zip files, along with a list -/// of [entries][StoredEntry]. +/// of [entries][Entry].
/// /// It is obtained through a state machine like /// [ArchiveFsm](crate::fsm::ArchiveFsm), although end-users tend to use @@ -17,84 +21,81 @@ use crate::{ pub struct Archive { pub(crate) size: u64, pub(crate) encoding: Encoding, - pub(crate) entries: Vec<StoredEntry>, - pub(crate) comment: Option<String>, + pub(crate) entries: Vec<Entry>, + pub(crate) comment: String, } impl Archive { /// The size of .zip file that was read, in bytes. + #[inline(always)] pub fn size(&self) -> u64 { self.size } /// Iterate over all files in this zip, read from the central directory. - pub fn entries(&self) -> impl Iterator<Item = &StoredEntry> { + pub fn entries(&self) -> impl Iterator<Item = &Entry> { self.entries.iter() } /// Attempts to look up an entry by name. This is usually a bad idea, /// as names aren't necessarily normalized in zip archives. - pub fn by_name<N: AsRef<str>>(&self, name: N) -> Option<&StoredEntry> { - self.entries.iter().find(|&x| x.name() == name.as_ref()) + pub fn by_name<N: AsRef<str>>(&self, name: N) -> Option<&Entry> { - self.entries.iter().find(|&x| x.name == name.as_ref()) } /// Returns the detected character encoding for text fields /// (names, comments) inside this zip archive. + #[inline(always)] pub fn encoding(&self) -> Encoding { self.encoding } /// Returns the comment for this archive, if any. When reading /// a zip file with an empty comment field, this will return None. - pub fn comment(&self) -> Option<&String> { - self.comment.as_ref() + #[inline(always)] + pub fn comment(&self) -> &str { + &self.comment } } /// Describes a zip archive entry (a file, a directory, a symlink) -/// -/// `Entry` contains normalized metadata fields, that can be set when -/// writing a zip archive. Additional metadata, along with the information -/// required to extract an entry, are available in [StoredEntry][] instead. #[derive(Clone)] pub struct Entry { /// Name of the file - /// Must be a relative path, not start with a drive letter (e.g. C:), - /// and must use forward slashes instead of back slashes + /// + /// This should be a relative path, separated by `/`. However, there are zip + /// files in the wild with all sorts of evil variants, so, be conservative + /// in what you accept. + /// + /// See also [Self::sanitized_name], which returns a sanitized version of + /// the name, working around zip slip vulnerabilities. pub name: String, - /// Compression method - /// - /// See [Method][] for more details. + /// Compression method: Store, Deflate, Bzip2, etc. pub method: Method, /// Comment is any arbitrary user-defined string shorter than 64KiB - pub comment: Option<String>, - - /// Modified timestamp - pub modified: chrono::DateTime<Utc>, + pub comment: String, - /// Created timestamp - pub created: Option<DateTime<Utc>>, + /// This entry's "last modified" timestamp - with caveats + /// + /// Due to the history of the ZIP file format, this may be inaccurate. It may be offset + /// by a few hours, if there is no extended timestamp information. It may have a resolution + /// as low as two seconds, if only MSDOS timestamps are present. It may default to the Unix + /// epoch, if something went really wrong. + /// + /// If you're reading this after the year 2038, or after the year 2108, godspeed. + pub modified: DateTime<Utc>, - /// Accessed timestamp - pub accessed: Option<DateTime<Utc>>, -} + /// This entry's "created" timestamp, if available. + /// + /// See [Self::modified] for caveats. + pub created: Option<DateTime<Utc>>, -/// An entry as stored into an Archive. Contains additional metadata and offset information.
-/// -/// Whereas [Entry][] is archive-independent, [StoredEntry][] contains information that is tied to -/// a specific archive. -/// -/// When reading archives, one deals with a list of [StoredEntry][], whereas when writing one, one -/// typically only specifies an [Entry][] and provides the entry's contents: fields like the CRC32 -/// hash, uncompressed size, and compressed size are derived automatically from the input. -#[derive(Clone)] -pub struct StoredEntry { - /// Archive-independent information + /// This entry's "last accessed" timestamp, if available. /// - /// This contains the entry's name, timestamps, comment, compression method. - pub entry: Entry, + /// See [Self::modified] for caveats. + pub accessed: Option<DateTime<Utc>>, /// Offset of the local file header in the zip file /// @@ -111,12 +112,6 @@ pub struct StoredEntry { /// ``` pub header_offset: u64, - /// External attributes (zip) - pub external_attrs: u32, - - /// Version of zip supported by the tool that crated this archive. - pub creator_version: Version, - /// Version of zip needed to extract this archive. pub reader_version: Version, @@ -139,24 +134,6 @@ pub struct StoredEntry { /// Only present if a Unix extra field or New Unix extra field was found. pub gid: Option<u32>, - /// File mode - pub mode: Mode, - - /// Any extra fields recognized while parsing the file. - /// - /// Most of these should be normalized and accessible as other fields, - /// but they are also made available here raw. - pub extra_fields: Vec<ExtraField>, - - /// These fields are cheap to clone and needed for entry readers, - /// hence them being in a separate struct - pub inner: StoredEntryInner, -} - -/// Fields required to read an entry properly, typically cloned into owned entry -/// readers. -#[derive(Clone, Copy, Debug)] -pub struct StoredEntryInner { /// CRC-32 hash as found in the central directory. /// /// Note that this may be zero, and the actual CRC32 might be in the local header, or (more /// ... pub crc32: u32, ... /// This will be zero for directories. pub uncompressed_size: u64, - /// True if this entry was read from a zip64 archive - pub is_zip64: bool, + /// File mode. + pub mode: Mode, } -impl StoredEntry { - /// Returns the entry's name. See also - /// [sanitized_name()](StoredEntry::sanitized_name), which returns a - /// sanitized version of the name. - /// - /// This should be a relative path, separated by `/`. However, there are zip - /// files in the wild with all sorts of evil variants, so, be conservative - /// in what you accept. - pub fn name(&self) -> &str { - self.entry.name.as_ref() - } - +impl Entry { /// Returns a sanitized version of the entry's name, if it /// seems safe. In particular, if this method feels like the /// entry name is trying to do a zip slip (cf. @@ -195,7 +161,7 @@ impl StoredEntry { /// /// Other than that, it will strip any leading slashes on non-Windows OSes. pub fn sanitized_name(&self) -> Option<&str> { - let name = self.name(); + let name = self.name.as_str(); // refuse entries with traversed/absolute path to mitigate zip slip if name.contains("..") { @@ -223,52 +189,56 @@ impl StoredEntry { } } - /// The entry's comment, if any. - /// - /// When reading a zip file, an empty comment results in None.
- pub fn comment(&self) -> Option<&str> { - self.entry.comment.as_ref().map(|x| x.as_ref()) - } - - /// The compression method used for this entry - #[inline(always)] - pub fn method(&self) -> Method { - self.entry.method - } - - /// This entry's "last modified" timestamp - with caveats - /// - /// Due to the history of the ZIP file format, this may be inaccurate. It may be offset - /// by a few hours, if there is no extended timestamp information. It may have a resolution - /// as low as two seconds, if only MSDOS timestamps are present. It may default to the Unix - /// epoch, if something went really wrong. - /// - /// If you're reading this after the year 2038, or after the year 2108, godspeed. - #[inline(always)] - pub fn modified(&self) -> DateTime<Utc> { - self.entry.modified - } - - /// This entry's "created" timestamp, if available. - /// - /// See [StoredEntry::modified()] for caveats. - #[inline(always)] - pub fn created(&self) -> Option<&DateTime<Utc>> { - self.entry.created.as_ref() - } - - /// This entry's "last accessed" timestamp, if available. - /// - /// See [StoredEntry::modified()] for caveats. - #[inline(always)] - pub fn accessed(&self) -> Option<&DateTime<Utc>> { - self.entry.accessed.as_ref() + /// Apply the extra field to the entry, updating its metadata. + pub(crate) fn set_extra_field(&mut self, ef: &ExtraField) { + match &ef { + ExtraField::Zip64(z64) => { + self.uncompressed_size = z64.uncompressed_size; + self.compressed_size = z64.compressed_size; + self.header_offset = z64.header_offset; + } + ExtraField::Timestamp(ts) => { + self.modified = Utc + .timestamp_opt(ts.mtime as i64, 0) + .single() + .unwrap_or_else(zero_datetime); + } + ExtraField::Ntfs(nf) => { + for attr in &nf.attrs { + // note: other attributes are unsupported + if let NtfsAttr::Attr1(attr) = attr { + self.modified = attr.mtime.to_datetime().unwrap_or_else(zero_datetime); + self.created = attr.ctime.to_datetime(); + self.accessed = attr.atime.to_datetime(); + } + } + } + ExtraField::Unix(uf) => { + self.modified = Utc + .timestamp_opt(uf.mtime as i64, 0) + .single() + .unwrap_or_else(zero_datetime); + + if self.uid.is_none() { + self.uid = Some(uf.uid as u32); + } + + if self.gid.is_none() { + self.gid = Some(uf.gid as u32); + } + } + ExtraField::NewUnix(uf) => { + self.uid = Some(uf.uid as u32); + self.gid = Some(uf.gid as u32); + } + _ => {} + }; } } -/// The contents of an entry: a directory, a file, or a symbolic link. +/// The entry's file type: a directory, a file, or a symbolic link. #[derive(Debug)] -pub enum EntryContents { +pub enum EntryKind { /// The entry is a directory Directory, @@ -279,15 +249,15 @@ pub enum EntryContents { /// The entry is a file File, /// The entry is a symbolic link Symlink, } -impl StoredEntry { - /// Determine [EntryContents] of this entry based on its mode. - pub fn contents(&self) -> EntryContents { +impl Entry { + /// Determine the kind of this entry based on its mode. + pub fn kind(&self) -> EntryKind { if self.mode.has(Mode::SYMLINK) { - EntryContents::Symlink + EntryKind::Symlink } else if self.mode.has(Mode::DIR) { - EntryContents::Directory + EntryKind::Directory } else { - EntryContents::File + EntryKind::File } } } @@ -299,7 +269,9 @@ impl StoredEntry { /// /// However, in the wild, it is not too uncommon to encounter [Bzip2][Method::Bzip2], /// [Lzma][Method::Lzma] or others.
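The Method enum just below gains num_enum derives plus a catch_all variant, which makes u16 <-> Method conversions total and lossless. A minimal sketch of the assumed behavior, based only on the derives and variants shown in this diff:

```rust
// Sketch: assumed behavior of the IntoPrimitive/FromPrimitive derives on Method.
use rc_zip::parse::Method;

fn method_round_trip() {
    // 0 is Store per APPNOTE 4.4.5
    assert_eq!(Method::from(0u16), Method::Store);
    // unknown method ids are captured by #[num_enum(catch_all)]...
    let exotic = Method::from(0x1234u16);
    assert_eq!(exotic, Method::Unrecognized(0x1234));
    // ...and round-trip losslessly back to u16 via IntoPrimitive
    assert_eq!(u16::from(exotic), 0x1234);
}
```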
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, IntoPrimitive, FromPrimitive)] +#[derive( + Debug, Clone, Copy, PartialEq, Eq, Hash, IntoPrimitive, FromPrimitive, IntoOwned, ToOwned, +)] #[repr(u16)] pub enum Method { /// No compression is applied @@ -342,3 +314,10 @@ pub enum Method { #[num_enum(catch_all)] Unrecognized(u16), } + +impl Method { + /// Parse a method from a byte slice + pub fn parser(i: &mut Partial<&[u8]>) -> PResult<Self> { + le_u16(i).map(From::from) + } +} diff --git a/rc-zip/src/parse/central_directory_file_header.rs b/rc-zip/src/parse/central_directory_file_header.rs new file mode 100644 index 0000000..9dfe7a1 --- /dev/null +++ b/rc-zip/src/parse/central_directory_file_header.rs @@ -0,0 +1,195 @@ +use std::borrow::Cow; + +use ownable::{IntoOwned, ToOwned}; +use tracing::trace; +use winnow::{ + binary::{le_u16, le_u32}, + prelude::PResult, + token::{tag, take}, + Parser, Partial, +}; + +use crate::{ + encoding::detect_utf8, + encoding::Encoding, + error::{Error, FormatError}, + parse::{ + zero_datetime, Entry, ExtraField, ExtraFieldSettings, HostSystem, Mode, MsdosMode, + MsdosTimestamp, UnixMode, Version, + }, +}; + +use super::Method; + +/// 4.3.12 Central directory structure: File header +#[derive(IntoOwned, ToOwned)] +pub struct CentralDirectoryFileHeader<'a> { + /// version made by + pub creator_version: Version, + + /// version needed to extract + pub reader_version: Version, + + /// general purpose bit flag + pub flags: u16, + + /// compression method + pub method: Method, + + /// last mod file datetime + pub modified: MsdosTimestamp, + + /// crc32 hash + pub crc32: u32, + + /// compressed size + pub compressed_size: u32, + + /// uncompressed size + pub uncompressed_size: u32, + + /// disk number start + pub disk_nbr_start: u16, + + /// internal file attributes + pub internal_attrs: u16, + + /// external file attributes + pub external_attrs: u32, + + /// relative offset of local header + pub header_offset: u32, + + /// name field + pub name: Cow<'a, [u8]>, + + /// extra field + pub extra: Cow<'a, [u8]>, + + /// comment field + pub comment: Cow<'a, [u8]>, +} + +impl<'a> CentralDirectoryFileHeader<'a> { + const SIGNATURE: &'static str = "PK\x01\x02"; + + /// Parser for the central directory file header + pub fn parser(i: &mut Partial<&'a [u8]>) -> PResult<Self> { + _ = tag(Self::SIGNATURE).parse_next(i)?; + let creator_version = Version::parser.parse_next(i)?; + let reader_version = Version::parser.parse_next(i)?; + let flags = le_u16.parse_next(i)?; + let method = Method::parser.parse_next(i)?; + let modified = MsdosTimestamp::parser.parse_next(i)?; + let crc32 = le_u32.parse_next(i)?; + let compressed_size = le_u32.parse_next(i)?; + let uncompressed_size = le_u32.parse_next(i)?; + let name_len = le_u16.parse_next(i)?; + let extra_len = le_u16.parse_next(i)?; + let comment_len = le_u16.parse_next(i)?; + let disk_nbr_start = le_u16.parse_next(i)?; + let internal_attrs = le_u16.parse_next(i)?; + let external_attrs = le_u32.parse_next(i)?; + let header_offset = le_u32.parse_next(i)?; + + let name = take(name_len).parse_next(i)?; + let extra = take(extra_len).parse_next(i)?; + let comment = take(comment_len).parse_next(i)?; + + Ok(Self { + creator_version, + reader_version, + flags, + method, + modified, + crc32, + compressed_size, + uncompressed_size, + disk_nbr_start, + internal_attrs, + external_attrs, + header_offset, + name: Cow::Borrowed(name), + extra: Cow::Borrowed(extra), + comment: Cow::Borrowed(comment), + }) + } +} + +impl CentralDirectoryFileHeader<'_> { + ///
Returns true if the name or comment is not valid UTF-8 + pub fn is_non_utf8(&self) -> bool { + let (valid1, require1) = detect_utf8(&self.name[..]); + let (valid2, require2) = detect_utf8(&self.comment[..]); + if !valid1 || !valid2 { + // definitely not utf-8 + return true; + } + + if !require1 && !require2 { + // name and comment only use single-byte runes that overlap with UTF-8 + return false; + } + + // Might be UTF-8, might be some other encoding; preserve existing flag. + // Some ZIP writers use UTF-8 encoding without setting the UTF-8 flag. + // Since it is impossible to always distinguish valid UTF-8 from some + // other encoding (e.g., GBK or Shift-JIS), we trust the flag. + self.flags & 0x800 == 0 + } + + /// Converts the directory header into an entry: this involves + /// parsing the extra fields and converting the timestamps. + pub fn as_entry(&self, encoding: Encoding, global_offset: u64) -> Result<Entry, Error> { + let mut entry = Entry { + name: encoding.decode(&self.name[..])?, + method: self.method, + comment: encoding.decode(&self.comment[..])?, + modified: self.modified.to_datetime().unwrap_or_else(zero_datetime), + created: None, + accessed: None, + header_offset: self.header_offset as u64 + global_offset, + reader_version: self.reader_version, + flags: self.flags, + uid: None, + gid: None, + crc32: self.crc32, + compressed_size: self.compressed_size as _, + uncompressed_size: self.uncompressed_size as _, + mode: Mode(0), + }; + + entry.mode = match self.creator_version.host_system { + HostSystem::Unix | HostSystem::Osx => UnixMode(self.external_attrs >> 16).into(), + HostSystem::WindowsNtfs | HostSystem::Vfat | HostSystem::MsDos => { + MsdosMode(self.external_attrs).into() + } + _ => Mode(0), + }; + if entry.name.ends_with('/') { + // believe it or not, this is straight from the APPNOTE + entry.mode |= Mode::DIR + }; + + let settings = ExtraFieldSettings { + uncompressed_size_u32: self.uncompressed_size, + compressed_size_u32: self.compressed_size, + header_offset_u32: self.header_offset, + }; + + let mut slice = Partial::new(&self.extra[..]); + while !slice.is_empty() { + match ExtraField::mk_parser(settings).parse_next(&mut slice) { + Ok(ef) => { + entry.set_extra_field(&ef); + } + Err(e) => { + trace!("extra field error: {:#?}", e); + return Err(FormatError::InvalidExtraField.into()); + } + } + } + + Ok(entry) + } +} diff --git a/rc-zip/src/parse/date_time.rs b/rc-zip/src/parse/date_time.rs index 2ebdd87..d45ae94 100644 --- a/rc-zip/src/parse/date_time.rs +++ b/rc-zip/src/parse/date_time.rs @@ -2,6 +2,7 @@ use chrono::{ offset::{LocalResult, TimeZone, Utc}, DateTime, Timelike, }; +use ownable::{IntoOwned, ToOwned}; use std::fmt; use winnow::{ binary::{le_u16, le_u64}, @@ -11,7 +12,7 @@ /// A timestamp in MS-DOS format /// /// Represents dates from year 1980 to 2180, with 2 second precision.
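For context on where that two-second precision comes from: the two u16 fields pack a civil date/time as bit fields. A standalone sketch of the layout (per the APPNOTE and DOS conventions, not rc-zip's own code; `MsdosTimestamp::to_datetime` performs the equivalent decoding through chrono, with validation):

```rust
// Sketch: the MS-DOS date/time bit layout used by zip.
// time: bits 0-4 = seconds/2, bits 5-10 = minutes, bits 11-15 = hours
// date: bits 0-4 = day, bits 5-8 = month, bits 9-15 = years since 1980
fn decode_msdos(time: u16, date: u16) -> (u32, u32, u32, u32, u32, u32) {
    let seconds = ((time & 0x1f) * 2) as u32; // hence the 2-second granularity
    let minutes = ((time >> 5) & 0x3f) as u32;
    let hours = (time >> 11) as u32;
    let day = (date & 0x1f) as u32;
    let month = ((date >> 5) & 0xf) as u32;
    let year = 1980 + (date >> 9) as u32;
    (year, month, day, hours, minutes, seconds)
}
```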
-#[derive(Clone, Copy, Eq, PartialEq)] +#[derive(Clone, Copy, Eq, PartialEq, IntoOwned, ToOwned)] pub struct MsdosTimestamp { /// Time in 2-second intervals pub time: u16, diff --git a/rc-zip/src/parse/directory_header.rs b/rc-zip/src/parse/directory_header.rs deleted file mode 100644 index db38717..0000000 --- a/rc-zip/src/parse/directory_header.rs +++ /dev/null @@ -1,272 +0,0 @@ -use chrono::{offset::TimeZone, DateTime, Utc}; -use tracing::trace; -use winnow::{ - binary::{le_u16, le_u32}, - prelude::PResult, - token::tag, - Parser, Partial, -}; - -use crate::{ - encoding::detect_utf8, - encoding::Encoding, - error::{Error, FormatError}, - parse::{ - zero_datetime, Entry, ExtraField, ExtraFieldSettings, HostSystem, Mode, MsdosMode, - MsdosTimestamp, NtfsAttr, StoredEntry, StoredEntryInner, UnixMode, Version, ZipBytes, - ZipString, - }, -}; - -/// 4.3.12 Central directory structure: File header -pub struct DirectoryHeader { - /// version made by - pub creator_version: Version, - - /// version needed to extract - pub reader_version: Version, - - /// general purpose bit flag - pub flags: u16, - - /// compression method - pub method: u16, - - /// last mod file datetime - pub modified: MsdosTimestamp, - - /// crc32 hash - pub crc32: u32, - - /// compressed size - pub compressed_size: u32, - - /// uncompressed size - pub uncompressed_size: u32, - - /// disk number start - pub disk_nbr_start: u16, - - /// internal file attributes - pub internal_attrs: u16, - - /// external file attributes - pub external_attrs: u32, - - /// relative offset of local header - pub header_offset: u32, - - /// name - pub name: ZipString, // FIXME: should this be Cow<str>? - - /// extra - pub extra: ZipBytes, // FIXME: should this be Cow<[u8]>? - - /// comment - pub comment: ZipString, -} - -impl DirectoryHeader { - const SIGNATURE: &'static str = "PK\x01\x02"; - - /// Parser for the central directory file header - pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult<Self> { - _ = tag(Self::SIGNATURE).parse_next(i)?; - let creator_version = Version::parser.parse_next(i)?; - let reader_version = Version::parser.parse_next(i)?; - let flags = le_u16.parse_next(i)?; - let method = le_u16.parse_next(i)?; - let modified = MsdosTimestamp::parser.parse_next(i)?; - let crc32 = le_u32.parse_next(i)?; - let compressed_size = le_u32.parse_next(i)?; - let uncompressed_size = le_u32.parse_next(i)?; - let name_len = le_u16.parse_next(i)?; - let extra_len = le_u16.parse_next(i)?; - let comment_len = le_u16.parse_next(i)?; - let disk_nbr_start = le_u16.parse_next(i)?; - let internal_attrs = le_u16.parse_next(i)?; - let external_attrs = le_u32.parse_next(i)?; - let header_offset = le_u32.parse_next(i)?; - - let name = ZipString::parser(name_len).parse_next(i)?; - let extra = ZipBytes::parser(extra_len).parse_next(i)?; - let comment = ZipString::parser(comment_len).parse_next(i)?; - - Ok(Self { - creator_version, - reader_version, - flags, - method, - modified, - crc32, - compressed_size, - uncompressed_size, - disk_nbr_start, - internal_attrs, - external_attrs, - header_offset, - name, - extra, - comment, - }) - } -} - -impl DirectoryHeader { - /// Returns true if the name or comment is not valid UTF-8 - pub fn is_non_utf8(&self) -> bool { - let (valid1, require1) = detect_utf8(&self.name.0[..]); - let (valid2, require2) = detect_utf8(&self.comment.0[..]); - if !valid1 || !valid2 { - // definitely not utf-8 - return true; - } - - if !require1 && !require2 { - // name and comment only use single-byte runes that overlap with UTF-8 - return false; - }
- - // Might be UTF-8, might be some other encoding; preserve existing flag. - // Some ZIP writers use UTF-8 encoding without setting the UTF-8 flag. - // Since it is impossible to always distinguish valid UTF-8 from some - // other encoding (e.g., GBK or Shift-JIS), we trust the flag. - self.flags & 0x800 == 0 - } - - /// Converts the directory header into a stored entry: this involves - /// parsing the extra fields and converting the timestamps. - pub fn as_stored_entry( - &self, - is_zip64: bool, - encoding: Encoding, - global_offset: u64, - ) -> Result<StoredEntry, Error> { - let mut comment: Option<String> = None; - if let Some(comment_field) = self.comment.clone().into_option() { - comment = Some(encoding.decode(&comment_field.0)?); - } - - let name = encoding.decode(&self.name.0)?; - - let mut compressed_size = self.compressed_size as u64; - let mut uncompressed_size = self.uncompressed_size as u64; - let mut header_offset = self.header_offset as u64 + global_offset; - - let mut modified: Option<DateTime<Utc>> = None; - let mut created: Option<DateTime<Utc>> = None; - let mut accessed: Option<DateTime<Utc>> = None; - - let mut uid: Option<u32> = None; - let mut gid: Option<u32> = None; - - let mut extra_fields: Vec<ExtraField> = Vec::new(); - - let settings = ExtraFieldSettings { - needs_compressed_size: self.compressed_size == !0u32, - needs_uncompressed_size: self.uncompressed_size == !0u32, - needs_header_offset: self.header_offset == !0u32, - }; - - let mut slice = Partial::new(&self.extra.0[..]); - while !slice.is_empty() { - match ExtraField::mk_parser(settings).parse_next(&mut slice) { - Ok(ef) => { - match &ef { - ExtraField::Zip64(z64) => { - if let Some(n) = z64.uncompressed_size { - uncompressed_size = n; - } - if let Some(n) = z64.compressed_size { - compressed_size = n; - } - if let Some(n) = z64.header_offset { - header_offset = n; - } - } - ExtraField::Timestamp(ts) => { - modified = Utc.timestamp_opt(ts.mtime as i64, 0).single(); - } - ExtraField::Ntfs(nf) => { - for attr in &nf.attrs { - // note: other attributes are unsupported - if let NtfsAttr::Attr1(attr) = attr { - modified = attr.mtime.to_datetime(); - created = attr.ctime.to_datetime(); - accessed = attr.atime.to_datetime(); - } - } - } - ExtraField::Unix(uf) => { - modified = Utc.timestamp_opt(uf.mtime as i64, 0).single(); - if uid.is_none() { - uid = Some(uf.uid as u32); - } - if gid.is_none() { - gid = Some(uf.gid as u32); - } - } - ExtraField::NewUnix(uf) => { - uid = Some(uf.uid as u32); - gid = Some(uf.uid as u32); - } - _ => {} - }; - extra_fields.push(ef); - } - Err(e) => { - trace!("extra field error: {:#?}", e); - return Err(FormatError::InvalidExtraField.into()); - } - } - } - - let modified = match modified { - Some(m) => Some(m), - None => self.modified.to_datetime(), - }; - - let mut mode: Mode = match self.creator_version.host_system() { - HostSystem::Unix | HostSystem::Osx => UnixMode(self.external_attrs >> 16).into(), - HostSystem::WindowsNtfs | HostSystem::Vfat | HostSystem::MsDos => { - MsdosMode(self.external_attrs).into() - } - _ => Mode(0), - }; - if name.ends_with('/') { - // believe it or not, this is straight from the APPNOTE - mode |= Mode::DIR - }; - - Ok(StoredEntry { - entry: Entry { - name, - method: self.method.into(), - comment, - modified: modified.unwrap_or_else(zero_datetime), - created, - accessed, - }, - - creator_version: self.creator_version, - reader_version: self.reader_version, - flags: self.flags, - - inner: StoredEntryInner { - crc32: self.crc32, - compressed_size, - uncompressed_size, - is_zip64, - }, - header_offset, - - uid, - gid, - mode, - - 
extra_fields, - - external_attrs: self.external_attrs, - }) - } -} diff --git a/rc-zip/src/parse/eocd.rs b/rc-zip/src/parse/eocd.rs index 386b091..ae1692c 100644 --- a/rc-zip/src/parse/eocd.rs +++ b/rc-zip/src/parse/eocd.rs @@ -1,3 +1,6 @@ +use std::borrow::Cow; + +use ownable::{traits as ownable_traits, IntoOwned, ToOwned}; use tracing::trace; use winnow::{ binary::{le_u16, le_u32, le_u64, length_take}, @@ -6,14 +9,11 @@ PResult, Parser, Partial, }; -use crate::{ - error::{Error, FormatError}, - parse::ZipString, -}; +use crate::error::{Error, FormatError}; /// 4.3.16 End of central directory record: -#[derive(Debug)] -pub struct EndOfCentralDirectoryRecord { +#[derive(Debug, ToOwned, IntoOwned, Clone)] +pub struct EndOfCentralDirectoryRecord<'a> { /// number of this disk pub disk_nbr: u16, @@ -33,16 +33,16 @@ pub directory_offset: u32, /// .ZIP file comment - pub comment: ZipString, + pub comment: Cow<'a, [u8]>, } -impl EndOfCentralDirectoryRecord { +impl<'a> EndOfCentralDirectoryRecord<'a> { /// Does not include comment size & comment data const MIN_LENGTH: usize = 20; const SIGNATURE: &'static str = "PK\x05\x06"; /// Find the end of central directory record in a block of data - pub fn find_in_block(b: &[u8]) -> Option<Located<Self>> { + pub fn find_in_block(b: &'a [u8]) -> Option<Located<Self>> { for i in (0..(b.len() - Self::MIN_LENGTH + 1)).rev() { let mut input = Partial::new(&b[i..]); if let Ok(directory) = Self::parser.parse_next(&mut input) { @@ -56,7 +56,7 @@ } /// Parser for the end of central directory record - pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult<Self> { + pub fn parser(i: &mut Partial<&'a [u8]>) -> PResult<Self> { let _ = tag(Self::SIGNATURE).parse_next(i)?; seq! {Self { disk_nbr: le_u16, @@ -65,7 +65,7 @@ directory_records: le_u16, directory_size: le_u32, directory_offset: le_u32, - comment: length_take(le_u16).map(ZipString::from), + comment: length_take(le_u16).map(Cow::Borrowed), }} .parse_next(i) } @@ -100,7 +100,7 @@ } /// 4.3.14 Zip64 end of central directory record -#[derive(Debug)] +#[derive(Debug, Clone, ToOwned, IntoOwned)] pub struct EndOfCentralDirectory64Record { /// size of zip64 end of central directory record pub record_size: u64, @@ -153,7 +153,7 @@ } /// A zip structure and its location in the input file -#[derive(Debug)] +#[derive(Debug, Clone)] pub struct Located<T> { /// Absolute by offset from the start of the file pub offset: u64, @@ -162,23 +162,39 @@ pub inner: T, } -impl<T> std::ops::Deref for Located<T> { - type Target = T; - fn deref(&self) -> &Self::Target { - &self.inner +impl<T> ownable_traits::ToOwned for Located<T> +where + T: ownable_traits::ToOwned, +{ + type Owned = Located<T::Owned>; + + fn to_owned(&self) -> Self::Owned { + Located { + offset: self.offset, + inner: self.inner.to_owned(), + } } } -impl<T> std::ops::DerefMut for Located<T> { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut self.inner +impl<T> ownable_traits::IntoOwned for Located<T> +where + T: ownable_traits::IntoOwned, +{ + type Owned = Located<T::Owned>; + + fn into_owned(self) -> Self::Owned { + Located { + offset: self.offset, + inner: self.inner.into_owned(), + } } } /// Coalesces zip and zip64 "end of central directory" record info -pub struct EndOfCentralDirectory { +#[derive(ToOwned, IntoOwned)] +pub struct EndOfCentralDirectory<'a> { /// The end of central directory record - pub dir: Located<EndOfCentralDirectoryRecord>, + pub dir: 
Located<EndOfCentralDirectoryRecord<'a>>, /// The zip64 end of central directory record pub dir64: Option<Located<EndOfCentralDirectory64Record>>, @@ -188,10 +204,10 @@ pub struct EndOfCentralDirectory { pub global_offset: i64, } -impl EndOfCentralDirectory { +impl<'a> EndOfCentralDirectory<'a> { pub(crate) fn new( size: u64, - dir: Located<EndOfCentralDirectoryRecord>, + dir: Located<EndOfCentralDirectoryRecord<'a>>, dir64: Option<Located<EndOfCentralDirectory64Record>>, ) -> Result<Self, Error> { let mut res = Self { @@ -219,7 +235,7 @@ impl EndOfCentralDirectory { // // (e.g. https://www.icculus.org/mojosetup/ installers are ELF binaries with a .zip file appended) // - // `directory_end_offfset` is found by scanning the file (so it accounts for padding), but + // `directory_end_offset` is found by scanning the file (so it accounts for padding), but // `directory_offset` is found by reading a data structure (so it does not account for padding). // If we just trusted `directory_offset`, we'd be reading the central directory at the wrong place: // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -266,37 +282,37 @@ impl EndOfCentralDirectory { #[inline] pub(crate) fn directory_offset(&self) -> u64 { match self.dir64.as_ref() { - Some(d64) => d64.directory_offset, - None => self.dir.directory_offset as u64, + Some(d64) => d64.inner.directory_offset, + None => self.dir.inner.directory_offset as u64, } } #[inline] pub(crate) fn directory_size(&self) -> u64 { match self.dir64.as_ref() { - Some(d64) => d64.directory_size, - None => self.dir.directory_size as u64, + Some(d64) => d64.inner.directory_size, + None => self.dir.inner.directory_size as u64, } } #[inline] pub(crate) fn set_directory_offset(&mut self, offset: u64) { match self.dir64.as_mut() { - Some(d64) => d64.directory_offset = offset, - None => self.dir.directory_offset = offset as u32, + Some(d64) => d64.inner.directory_offset = offset, + None => self.dir.inner.directory_offset = offset as u32, }; } #[inline] pub(crate) fn directory_records(&self) -> u64 { match self.dir64.as_ref() { - Some(d64) => d64.directory_records, - None => self.dir.directory_records as u64, + Some(d64) => d64.inner.directory_records, + None => self.dir.inner.directory_records as u64, } } #[inline] - pub(crate) fn comment(&self) -> &ZipString { - &self.dir.comment + pub(crate) fn comment(&self) -> &[u8] { + &self.dir.inner.comment } } diff --git a/rc-zip/src/parse/extra_field.rs b/rc-zip/src/parse/extra_field.rs index 9b3693b..40597fb 100644 --- a/rc-zip/src/parse/extra_field.rs +++ b/rc-zip/src/parse/extra_field.rs @@ -1,14 +1,17 @@ +use std::borrow::Cow; + +use ownable::{IntoOwned, ToOwned}; use tracing::trace; use winnow::{ binary::{le_u16, le_u32, le_u64, le_u8, length_take}, - combinator::{cond, opt, preceded, repeat_till}, + combinator::{opt, preceded, repeat_till}, error::{ErrMode, ErrorKind, ParserError, StrContext}, seq, token::{tag, take}, PResult, Parser, Partial, }; -use crate::parse::{NtfsTimestamp, ZipBytes}; +use crate::parse::NtfsTimestamp; /// 4.4.28 extra field: (Variable) pub(crate) struct ExtraFieldRecord<'a> { @@ -26,19 +29,30 @@ impl<'a> ExtraFieldRecord<'a> { } } -// Useful because zip64 extended information extra field has fixed order *but* -// optional fields. From the appnote: -// -// If one of the size or offset fields in the Local or Central directory record -// is too small to hold the required data, a Zip64 extended information record -// is created. The order of the fields in the zip64 extended information record -// is fixed, but the fields MUST only appear if the corresponding Local or -// Central directory record field is set to 0xFFFF or 0xFFFFFFFF.
+/// Useful because zip64 extended information extra field has fixed order *but* +/// optional fields. From the appnote: +/// +/// If one of the size or offset fields in the Local or Central directory record +/// is too small to hold the required data, a Zip64 extended information record +/// is created. The order of the fields in the zip64 extended information record +/// is fixed, but the fields MUST only appear if the corresponding Local or +/// Central directory record field is set to 0xFFFF or 0xFFFFFFFF. #[derive(Debug, Clone, Copy)] -pub(crate) struct ExtraFieldSettings { - pub(crate) needs_uncompressed_size: bool, - pub(crate) needs_compressed_size: bool, - pub(crate) needs_header_offset: bool, +pub struct ExtraFieldSettings { + /// The uncompressed size field read from a local or central directory record + /// If this is 0xFFFF_FFFF, then the zip64 extra field uncompressed size + /// field will be present. + pub uncompressed_size_u32: u32, + + /// The compressed size field read from a local or central directory record + /// If this is 0xFFFF_FFFF, then the zip64 extra field compressed size + /// field will be present. + pub compressed_size_u32: u32, + + /// The header offset field read from a central directory record (or zero + /// for local directory records). If this is 0xFFFF_FFFF, then the zip64 + /// extra field header offset field will be present. + pub header_offset_u32: u32, } /// Information stored in the central directory header `extra` field @@ -47,13 +61,13 @@ pub(crate) struct ExtraFieldSettings { /// /// See `extrafld.txt` in this crate's source distribution. #[derive(Clone)] -pub enum ExtraField { +pub enum ExtraField<'a> { /// Zip64 extended information extra field Zip64(ExtraZip64Field), /// Extended timestamp Timestamp(ExtraTimestampField), /// UNIX & Info-Zip UNIX - Unix(ExtraUnixField), + Unix(ExtraUnixField<'a>), /// New UNIX extra field NewUnix(ExtraNewUnixField), /// NTFS (Win9x/WinNT FileTimes) @@ -65,14 +79,20 @@ pub enum ExtraField { }, } -impl ExtraField { - pub(crate) fn mk_parser( +impl<'a> ExtraField<'a> { + /// Make a parser for extra fields, given the settings for the zip64 extra + /// field (which depend on whether the u32 values are 0xFFFF_FFFF or not) + pub fn mk_parser( settings: ExtraFieldSettings, - ) -> impl FnMut(&mut Partial<&'_ [u8]>) -> PResult<Self> { + ) -> impl FnMut(&mut Partial<&'a [u8]>) -> PResult<Self> { move |i| { use ExtraField as EF; let rec = ExtraFieldRecord::parser.parse_next(i)?; - trace!("parsing extra field record, tag {:04x}", rec.tag); + trace!( + "parsing extra field record, tag {:04x}, len {}", + rec.tag, + rec.payload.len() + ); let payload = &mut Partial::new(rec.payload); let variant = match rec.tag { @@ -83,7 +103,7 @@ impl ExtraField { .context(StrContext::Label("timestamp")) .parse_next(payload)?, ExtraNtfsField::TAG => { - opt(ExtraNtfsField::parse.map(EF::Ntfs)).parse_next(payload)? + opt(ExtraNtfsField::parser.map(EF::Ntfs)).parse_next(payload)? } ExtraUnixField::TAG | ExtraUnixField::TAG_INFOZIP => { opt(ExtraUnixField::parser.map(EF::Unix)).parse_next(payload)?
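Since `ExtraFieldSettings` and `mk_parser` are now public, a caller can walk an extra-field blob the same way `as_entry` does. A hedged sketch, assuming only the public items introduced in this diff (error handling simplified):

```rust
// Sketch: driving ExtraField::mk_parser with settings taken from a parsed
// CentralDirectoryFileHeader, mirroring what as_entry does internally.
use rc_zip::parse::{CentralDirectoryFileHeader, ExtraField, ExtraFieldSettings};
use winnow::{Parser, Partial};

fn walk_extra_fields(header: &CentralDirectoryFileHeader<'_>) {
    let settings = ExtraFieldSettings {
        uncompressed_size_u32: header.uncompressed_size,
        compressed_size_u32: header.compressed_size,
        header_offset_u32: header.header_offset,
    };
    let mut slice = Partial::new(&header.extra[..]);
    while !slice.is_empty() {
        match ExtraField::mk_parser(settings).parse_next(&mut slice) {
            // with the new settings, zip64 sizes are always concrete u64s
            Ok(ExtraField::Zip64(z64)) => {
                println!("zip64: {} -> {}", z64.compressed_size, z64.uncompressed_size);
            }
            Ok(_other) => {} // timestamps, unix fields, etc.
            Err(_) => break, // as_entry maps this to FormatError::InvalidExtraField
        }
    }
}
```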
@@ -104,13 +124,16 @@ #[derive(Clone, Default)] pub struct ExtraZip64Field { /// 64-bit uncompressed size - pub uncompressed_size: Option<u64>, + pub uncompressed_size: u64, /// 64-bit compressed size - pub compressed_size: Option<u64>, + pub compressed_size: u64, /// 64-bit header offset - pub header_offset: Option<u64>, + pub header_offset: u64, + + /// 32-bit disk start number + pub disk_start: Option<u32>, } impl ExtraZip64Field { @@ -120,13 +143,29 @@ settings: ExtraFieldSettings, ) -> impl FnMut(&mut Partial<&'_ [u8]>) -> PResult<Self> { move |i| { - // N.B: we ignore "disk start number" - seq! {Self { - uncompressed_size: cond(settings.needs_uncompressed_size, le_u64), - compressed_size: cond(settings.needs_compressed_size, le_u64), - header_offset: cond(settings.needs_header_offset, le_u64), - }} - .parse_next(i) + let uncompressed_size = if settings.uncompressed_size_u32 == 0xFFFF_FFFF { + le_u64.parse_next(i)? + } else { + settings.uncompressed_size_u32 as u64 + }; + let compressed_size = if settings.compressed_size_u32 == 0xFFFF_FFFF { + le_u64.parse_next(i)? + } else { + settings.compressed_size_u32 as u64 + }; + let header_offset = if settings.header_offset_u32 == 0xFFFF_FFFF { + le_u64.parse_next(i)? + } else { + settings.header_offset_u32 as u64 + }; + let disk_start = opt(le_u32.complete_err()).parse_next(i)?; + + Ok(Self { + uncompressed_size, + compressed_size, + header_offset, + disk_start, + }) } } } @@ -152,8 +191,8 @@ impl ExtraTimestampField { } /// 4.5.7 -UNIX Extra Field (0x000d): -#[derive(Clone)] -pub struct ExtraUnixField { +#[derive(Clone, ToOwned, IntoOwned)] +pub struct ExtraUnixField<'a> { /// file last access time pub atime: u32, /// file last modification time pub mtime: u32, /// file user id pub uid: u16, /// file group id pub gid: u16, /// variable length data field - pub data: ZipBytes, + pub data: Cow<'a, [u8]>, } -impl ExtraUnixField { +impl<'a> ExtraUnixField<'a> { const TAG: u16 = 0x000d; const TAG_INFOZIP: u16 = 0x5855; - fn parser(i: &mut Partial<&'_ [u8]>) -> PResult<Self> { + fn parser(i: &mut Partial<&'a [u8]>) -> PResult<Self> { let t_size = le_u16.parse_next(i)? - 12; seq! {Self { atime: le_u32, mtime: le_u32, uid: le_u16, gid: le_u16, - data: ZipBytes::parser(t_size), + data: take(t_size).map(Cow::Borrowed), }} .parse_next(i) } @@ -247,7 +286,7 @@ pub struct ExtraNtfsField { impl ExtraNtfsField { const TAG: u16 = 0x000a; - fn parse(i: &mut Partial<&'_ [u8]>) -> PResult<Self> { + fn parser(i: &mut Partial<&'_ [u8]>) -> PResult<Self> { let _ = take(4_usize).parse_next(i)?; // reserved (unused) seq! {Self { // from the winnow docs: // Partial data must mimic this and not return Incomplete when a parser reaches the end of the current // data or the end of the stream, causing them to always report // Incomplete.
// using repeat_till with eof combinator to work around this: - attrs: repeat_till(0.., NtfsAttr::parse, winnow::combinator::eof).map(|x| x.0), + attrs: repeat_till(0.., NtfsAttr::parser, winnow::combinator::eof).map(|x| x.0), }} .parse_next(i) } @@ -275,7 +314,7 @@ pub enum NtfsAttr { } impl NtfsAttr { - fn parse(i: &mut Partial<&'_ [u8]>) -> PResult<Self> { + fn parser(i: &mut Partial<&'_ [u8]>) -> PResult<Self> { let tag = le_u16.parse_next(i)?; trace!("parsing NTFS attribute, tag {:04x}", tag); let payload = length_take(le_u16).parse_next(i)?; diff --git a/rc-zip/src/parse/local.rs b/rc-zip/src/parse/local_headers.rs similarity index 68% rename from rc-zip/src/parse/local.rs rename to rc-zip/src/parse/local_headers.rs index fc73ef6..debf98a 100644 --- a/rc-zip/src/parse/local.rs +++ b/rc-zip/src/parse/local_headers.rs @@ -1,20 +1,27 @@ +use std::borrow::Cow; + use crate::{ - error::{Error, UnsupportedError}, - parse::{Method, MsdosTimestamp, Version, ZipBytes, ZipString}, + encoding::{detect_utf8, Encoding}, + error::{Error, FormatError, UnsupportedError}, + parse::{Method, MsdosTimestamp, Version}, }; +use ownable::{IntoOwned, ToOwned}; +use tracing::trace; use winnow::{ binary::{le_u16, le_u32, le_u64, le_u8}, combinator::opt, error::{ContextError, ErrMode, ErrorKind, FromExternalError}, seq, - token::tag, + token::{tag, take}, PResult, Parser, Partial, }; -#[derive(Debug)] +use super::{zero_datetime, Entry, ExtraField, ExtraFieldSettings, Mode}; + +#[derive(Debug, ToOwned, IntoOwned)] /// 4.3.7 Local file header -pub struct LocalFileHeaderRecord { +pub struct LocalFileHeader<'a> { /// version needed to extract pub reader_version: Version, @@ -37,16 +44,16 @@ pub uncompressed_size: u32, /// file name - pub name: ZipString, + pub name: Cow<'a, [u8]>, /// extra field - pub extra: ZipBytes, + pub extra: Cow<'a, [u8]>, /// method-specific fields pub method_specific: MethodSpecific, } -#[derive(Debug)] +#[derive(Debug, ToOwned, IntoOwned)] /// Method-specific properties following the local file header pub enum MethodSpecific { /// No method-specific properties Store, /// Lzma properties Lzma(LzmaProperties), } -impl LocalFileHeaderRecord { +impl<'a> LocalFileHeader<'a> { /// The signature for a local file header pub const SIGNATURE: &'static str = "PK\x03\x04"; /// Parser for the local file header - pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult<Self> { + pub fn parser(i: &mut Partial<&'a [u8]>) -> PResult<Self> { let _ = tag(Self::SIGNATURE).parse_next(i)?; let reader_version = Version::parser.parse_next(i)?; @@ -75,8 +82,8 @@ let name_len = le_u16.parse_next(i)?; let extra_len = le_u16.parse_next(i)?; - let name = ZipString::parser(name_len).parse_next(i)?; - let extra = ZipBytes::parser(extra_len).parse_next(i)?; + let name = take(name_len).parse_next(i).map(Cow::Borrowed)?; + let extra = take(extra_len).parse_next(i).map(Cow::Borrowed)?; let method_specific = match method { Method::Lzma => { @@ -114,6 +121,62 @@ // purpose bit flag is set (see below). self.flags & 0b1000 != 0 } + + /// Converts the local file header into an entry.
+ pub fn as_entry(&self) -> Result<Entry, Error> { + // see APPNOTE 4.4.4: Bit 11 is the language encoding flag (EFS) + let has_utf8_flag = self.flags & 0x800 != 0; + let encoding = if has_utf8_flag && detect_utf8(&self.name[..]).0 { + Encoding::Utf8 + } else { + Encoding::Cp437 + }; + let name = encoding.decode(&self.name[..])?; + + let mut entry = Entry { + name, + method: self.method, + comment: Default::default(), + modified: self.modified.to_datetime().unwrap_or_else(zero_datetime), + created: None, + accessed: None, + header_offset: 0, + reader_version: self.reader_version, + flags: self.flags, + uid: None, + gid: None, + crc32: self.crc32, + compressed_size: self.compressed_size as _, + uncompressed_size: self.uncompressed_size as _, + mode: Mode(0), + }; + + if entry.name.ends_with('/') { + // believe it or not, this is straight from the APPNOTE + entry.mode |= Mode::DIR + }; + + let mut slice = Partial::new(&self.extra[..]); + let settings = ExtraFieldSettings { + compressed_size_u32: self.compressed_size, + uncompressed_size_u32: self.uncompressed_size, + header_offset_u32: 0, + }; + + while !slice.is_empty() { + match ExtraField::mk_parser(settings).parse_next(&mut slice) { + Ok(ef) => { + entry.set_extra_field(&ef); + } + Err(e) => { + trace!("extra field error: {:#?}", e); + return Err(FormatError::InvalidExtraField.into()); + } + } + } + + Ok(entry) + } } /// 4.3.9 Data descriptor: @@ -163,7 +226,7 @@ impl DataDescriptorRecord { } /// 5.8.5 LZMA Properties header -#[derive(Debug)] +#[derive(Debug, ToOwned, IntoOwned)] pub struct LzmaProperties { /// major version pub major: u8, diff --git a/rc-zip/src/parse/mod.rs b/rc-zip/src/parse/mod.rs index 962c24e..6a7240e 100644 --- a/rc-zip/src/parse/mod.rs +++ b/rc-zip/src/parse/mod.rs @@ -22,14 +22,11 @@ pub use version::*; mod date_time; pub use date_time::*; -mod directory_header; -pub use directory_header::*; +mod central_directory_file_header; +pub use central_directory_file_header::*; mod eocd; pub use eocd::*; -mod local; -pub use local::*; - -mod raw; -pub use raw::*; +mod local_headers; +pub use local_headers::*; diff --git a/rc-zip/src/parse/raw.rs b/rc-zip/src/parse/raw.rs deleted file mode 100644 index fb978ab..0000000 --- a/rc-zip/src/parse/raw.rs +++ /dev/null @@ -1,77 +0,0 @@ -use pretty_hex::PrettyHex; -use std::fmt; -use winnow::{stream::ToUsize, token::take, PResult, Parser, Partial}; - -/// A raw zip string, with no specific encoding. -/// -/// This is used while parsing a zip archive's central directory, -/// before we know what encoding is used. -#[derive(Clone)] -pub struct ZipString(pub Vec<u8>); - -impl<'a> From<&'a [u8]> for ZipString { - fn from(slice: &'a [u8]) -> Self { - Self(slice.into()) - } -} - -impl fmt::Debug for ZipString { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match std::str::from_utf8(&self.0) { - Ok(s) => write!(f, "{:?}", s), - Err(_) => write!(f, "[non-utf8 string: {}]", self.0.hex_dump()), - } - } -} - -impl ZipString { - pub(crate) fn parser<C>(count: C) -> impl FnMut(&mut Partial<&'_ [u8]>) -> PResult<Self> - where - C: ToUsize, - { - let count = count.to_usize(); - move |i| (take(count).map(|slice: &[u8]| Self(slice.into()))).parse_next(i) - } - - pub(crate) fn into_option(self) -> Option<Self> { - if !self.0.is_empty() { - Some(self) - } else { - None - } - } -} - -/// A raw u8 slice, with no specific structure. -/// -/// This is used while parsing a zip archive, when we want -/// to retain an owned slice to be parsed later.
-#[derive(Clone)] -pub struct ZipBytes(pub Vec<u8>); - -impl fmt::Debug for ZipBytes { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - const MAX_SHOWN_SIZE: usize = 10; - let data = &self.0[..]; - let (slice, extra) = if data.len() > MAX_SHOWN_SIZE { - (&self.0[..MAX_SHOWN_SIZE], Some(data.len() - MAX_SHOWN_SIZE)) - } else { - (&self.0[..], None) - }; - write!(f, "{}", slice.hex_dump())?; - if let Some(extra) = extra { - write!(f, " (+ {} bytes)", extra)?; - } - Ok(()) - } -} - -impl ZipBytes { - pub(crate) fn parser<C>(count: C) -> impl FnMut(&mut Partial<&'_ [u8]>) -> PResult<Self> - where - C: ToUsize, - { - let count = count.to_usize(); - move |i| (take(count).map(|slice: &[u8]| Self(slice.into()))).parse_next(i) - } -} diff --git a/rc-zip/src/parse/version.rs b/rc-zip/src/parse/version.rs index 1b9ac8f..4a46204 100644 --- a/rc-zip/src/parse/version.rs +++ b/rc-zip/src/parse/version.rs @@ -1,5 +1,7 @@ +use num_enum::{FromPrimitive, IntoPrimitive}; +use ownable::{IntoOwned, ToOwned}; use std::fmt; -use winnow::{binary::le_u16, PResult, Parser, Partial}; +use winnow::{binary::le_u8, seq, PResult, Parser, Partial}; /// A zip version (either created by, or required when reading an archive). /// @@ -7,127 +9,105 @@ /// which features are required when reading a file. /// /// For more information, see the [.ZIP Application Note](https://support.pkware.com/display/PKZIP/APPNOTE), section 4.4.2. -#[derive(Clone, Copy, PartialEq, Eq, Hash)] -pub struct Version(pub u16); +#[derive(Clone, Copy, ToOwned, IntoOwned, PartialEq, Eq, Hash)] +pub struct Version { + /// The host system on which the zip attributes are compatible + pub host_system: HostSystem, + + /// Integer version, e.g. 45 for Zip version 4.5 + /// See APPNOTE, section 4.4.2.1 + pub version: u8, +} impl fmt::Debug for Version { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!( - f, - "{:?} v{}.{}", - self.host_system(), - self.major(), - self.minor() - ) + write!(f, "{:?} v{}", self.host_system, self.version) } } impl Version { /// Parse a version from a byte slice pub fn parser(i: &mut Partial<&'_ [u8]>) -> PResult<Self> { - le_u16.map(Self).parse_next(i) - } - - /// Identifies the host system on which the zip attributes are compatible. - pub fn host_system(&self) -> HostSystem { - match self.host() { - 0 => HostSystem::MsDos, - 1 => HostSystem::Amiga, - 2 => HostSystem::OpenVms, - 3 => HostSystem::Unix, - 4 => HostSystem::VmCms, - 5 => HostSystem::AtariSt, - 6 => HostSystem::Os2Hpfs, - 7 => HostSystem::Macintosh, - 8 => HostSystem::ZSystem, - 9 => HostSystem::CpM, - 10 => HostSystem::WindowsNtfs, - 11 => HostSystem::Mvs, - 12 => HostSystem::Vse, - 13 => HostSystem::AcornRisc, - 14 => HostSystem::Vfat, - 15 => HostSystem::AlternateMvs, - 16 => HostSystem::BeOs, - 17 => HostSystem::Tandem, - 18 => HostSystem::Os400, - 19 => HostSystem::Osx, - n => HostSystem::Unknown(n), - } - } - - /// Integer host system - pub fn host(&self) -> u8 { - (self.0 >> 8) as u8 - } - - /// Integer version, e.g. 45 for Zip version 4.5 - pub fn version(&self) -> u8 { - (self.0 & 0xff) as u8 - } - - /// ZIP specification major version - /// - /// See APPNOTE, section 4.4.2.1 - pub fn major(&self) -> u32 { - self.version() as u32 / 10 - } - - /// ZIP specification minor version - /// - /// See APPNOTE, section 4.4.2.1 - pub fn minor(&self) -> u32 { - self.version() as u32 % 10 + seq! 
{Self { + version: le_u8, + host_system: le_u8.map(HostSystem::from), + }} + .parse_next(i) } } /// System on which an archive was created, as encoded into a version u16. /// /// See APPNOTE, section 4.4.2.2 -#[derive(Debug)] +#[derive( + Debug, Clone, Copy, IntoPrimitive, FromPrimitive, ToOwned, IntoOwned, PartialEq, Eq, Hash, +)] +#[repr(u8)] pub enum HostSystem { /// MS-DOS and OS/2 (FAT / VFAT / FAT32 file systems) - MsDos, + MsDos = 0, + /// Amiga - Amiga, + Amiga = 1, + /// OpenVMS - OpenVms, + OpenVms = 2, + /// UNIX - Unix, + Unix = 3, + /// VM/CMS - VmCms, + VmCms = 4, + /// Atari ST - AtariSt, + AtariSt = 5, + /// OS/2 H.P.F.S - Os2Hpfs, + Os2Hpfs = 6, + /// Macintosh (see `Osx`) - Macintosh, + Macintosh = 7, + /// Z-System - ZSystem, + ZSystem = 8, + /// CP/M - CpM, + CpM = 9, + /// Windows NTFS - WindowsNtfs, + WindowsNtfs = 10, + /// MVS (OS/390 - Z/OS) - Mvs, + Mvs = 11, + /// VSE - Vse, + Vse = 12, + /// Acorn Risc - AcornRisc, + AcornRisc = 13, + /// VFAT - Vfat, + Vfat = 14, + /// alternate MVS - AlternateMvs, + AlternateMvs = 15, + /// BeOS - BeOs, + BeOs = 16, + /// Tandem - Tandem, + Tandem = 17, + /// OS/400 - Os400, + Os400 = 18, + /// OS X (Darwin) - Osx, + Osx = 19, + /// Unknown host system /// /// Values 20 through 255 are currently unused, as of - /// APPNOTE.TXT 6.3.6 (April 26, 2019) + /// APPNOTE.TXT 6.3.10 + #[num_enum(catch_all)] Unknown(u8), } diff --git a/rc-zip/tests/integration_tests.rs b/rc-zip/tests/integration_tests.rs index 8078b1c..2e61a95 100644 --- a/rc-zip/tests/integration_tests.rs +++ b/rc-zip/tests/integration_tests.rs @@ -9,17 +9,18 @@ use rc_zip::{ fn state_machine() { let cases = corpus::test_cases(); let case = cases.iter().find(|x| x.name == "zip64.zip").unwrap(); - let bs = std::fs::read(case.absolute_path()).unwrap(); - let mut fsm = ArchiveFsm::new(bs.len() as u64); + let bytes = case.bytes(); + + let mut fsm = ArchiveFsm::new(bytes.len() as u64); let archive = 'read_zip: loop { if let Some(offset) = fsm.wants_read() { let increment = 128usize; let offset = offset as usize; - let slice = if offset + increment > bs.len() { - &bs[offset..] + let slice = if offset + increment > bytes.len() { + &bytes[offset..] } else { - &bs[offset..offset + increment] + &bytes[offset..offset + increment] }; let len = cmp::min(slice.len(), fsm.space().len()); diff --git a/testdata/meta.zip b/testdata/meta.zip new file mode 100644 index 0000000..d5762f3 Binary files /dev/null and b/testdata/meta.zip differ diff --git a/testdata/wine-zeroed.zip.bz2 b/testdata/wine-zeroed.zip.bz2 new file mode 100644 index 0000000..1515dfb Binary files /dev/null and b/testdata/wine-zeroed.zip.bz2 differ
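For a sense of how the renamed surface reads end to end, here is a sketch using the sync convenience layer (the `read_zip` call follows rc-zip-sync's existing `ReadZip` trait as used in the jean example; exact handle types and trait impls are assumptions):

```rust
// Sketch: consuming the Entry/EntryKind API through rc-zip-sync.
use rc_zip::parse::EntryKind;
use rc_zip_sync::ReadZip;

fn list(bytes: &[u8]) -> Result<(), Box<dyn std::error::Error>> {
    let archive = bytes.read_zip()?;
    for entry in archive.entries() {
        // `name` is now a plain field; `comment` is a plain &str
        match entry.kind() {
            EntryKind::Directory => println!("dir  {}", entry.name),
            EntryKind::File => println!("file {} ({} bytes)", entry.name, entry.uncompressed_size),
            EntryKind::Symlink => println!("link {}", entry.name),
        }
    }
    Ok(())
}
```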