From 550d4c187684481a8e430205e309a25a087d8b3e Mon Sep 17 00:00:00 2001 From: "Elf M. Sternberg" Date: Sat, 26 Nov 2022 16:49:25 -0800 Subject: [PATCH] Intermediate progress: Squozen I realized that the C version of this thing does multiple things in the same function: it loads the bigrams, it iterates through the database, and it compares the things found in the database to the prepared pattern. It seems to me, therefore, that we're better off with an instance that loads the bigrams, then closes the database immediately. Later, the client can ask for one of two iterators: one that returns every entry in sequence, or one that returns, in sequence, only those entries that match the pattern passed in. --- .gitignore | 3 +- Cargo.toml | 1 + crates/squozen/Cargo.toml | 1 + crates/squozen/src/lib.rs | 4 +- crates/squozen/src/prepare_pattern.rs | 7 ++ crates/squozen/src/squozen.rs | 106 ++++++++++++++++++++++++++ docs/NOTES.md | 3 + 7 files changed, 123 insertions(+), 2 deletions(-) create mode 100644 crates/squozen/src/squozen.rs create mode 100644 docs/NOTES.md diff --git a/.gitignore b/.gitignore index 527985c..5a63621 100644 --- a/.gitignore +++ b/.gitignore @@ -5,5 +5,6 @@ **/*.rs.bk target Cargo.lock - +crates/squozen/docs/patprep/bench_patprep +crates/squozen/docs/patprep/rust_examples .ccls-cache diff --git a/Cargo.toml b/Cargo.toml index 997ec56..21e72f5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,4 +14,5 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] +libc = "0.2.137" diff --git a/crates/squozen/Cargo.toml b/crates/squozen/Cargo.toml index 75febcb..e57412c 100644 --- a/crates/squozen/Cargo.toml +++ b/crates/squozen/Cargo.toml @@ -11,6 +11,7 @@ path = "src/lib.rs" [dependencies] fnmatch-sys = "1.0.0" +libc = "0.2.137" [[bin]] name = "bench_patprep" diff --git a/crates/squozen/src/lib.rs b/crates/squozen/src/lib.rs index 8fd9eb2..9b7e5ed 100644 --- a/crates/squozen/src/lib.rs 
+++ b/crates/squozen/src/lib.rs @@ -1,3 +1,5 @@ // pub use crate::codesquoze; -// pub mod squozen; +extern crate libc; + pub mod prepare_pattern; +pub mod squozen; diff --git a/crates/squozen/src/prepare_pattern.rs b/crates/squozen/src/prepare_pattern.rs index 1237f17..07f186c 100644 --- a/crates/squozen/src/prepare_pattern.rs +++ b/crates/squozen/src/prepare_pattern.rs @@ -13,6 +13,13 @@ const GLOBSTARTS: &[u8] = &[b'?', b'*', b']']; // only the content of the pattern, and no nulls at either end, relying instead // on Rust's tracking the size of slices internally. +// One of the biggest changes between this and the original is that the original +// used "not pointing within the legal slice" as a sentinel for exceeding the +// bounds of the searchable space; since Rust doesn't allow that, we have to +// test ahead of time if the condition in which the original would have exceeded +// the legal search space is met, and short-circuit the return value at that +// point. + fn hunt(name: &[u8], end: usize, alt: usize, comp: F) -> usize where F: Fn(&u8) -> bool, diff --git a/crates/squozen/src/squozen.rs b/crates/squozen/src/squozen.rs new file mode 100644 index 0000000..524020e --- /dev/null +++ b/crates/squozen/src/squozen.rs @@ -0,0 +1,106 @@ +use libc; +use std::fs::File; +use std::io::{Bufreader, Bytes}; +use std::path::Path; + +const PARITY: u8 = 0o200; +const RECORD_SEPARATOR: u8 = 30; +const OFFSET: u8 = 14; +const BIGRAMS: usize = BIGRAMS; + +#[derive(Clone, Debug)] +pub struct Squozen { + bigrams: [char; BIGRAMS], + path: Path, +} + +pub impl Squozen { + pub fn new(filename: Path) -> Result { + let mut dbfile = File::open(filename)?; + let mut db = Bufreader::new(dbfile); + let mut bigrams: [char; BIGRAMS] = [0; BIGRAMS]; + db.read_exact(&mut bigrams)?; + Ok(Squozen { + bigrams, + path: filename.clone(), + }) + } + + pub fn paths(&mut self) -> StoredPath { + StoredPath::new(&self); + } + + /* + pub fn iter(&mut self) -> StoredPath { + FoundPath::new(&self); + } 
+ */ +} + +pub struct StoredPath<'a> { + source: Squozen, + path: [char; libc::PATH_MAX], + db: Bytes, + ch: u8, + last: usize, + found: bool, +} + +pub impl<'a> StoredPath { + pub fn new(squozen: Squozen) -> StoredPath { + let mut dbfile = File::open(&squozen.path)?; + let mut dbbuffer = Bufreader::new(dbfile); + dbbuffer.seek(BIGRAMS); + let mut db = dbbuffer.bytes(); + let mut ch = db.next()?; + StoredPath { + squozen, + db, + ch, + path: [0; libc::PATH_MAX], + last: 0, + } + } + + // Note that in either case, the file pointer will be pointing at the first + // valid character of the database, or it will be over. + pub fn getw(&mut self) -> Result { + let ch1 = self.db.next()?; + let ch2 = self.db.next()?; + Ok(u16::from_le_bytes(&[ch1, ch2])) + } + + pub fn get_offset(&mut self) -> Result { + if self.ch == RECORD_SEPARATOR { + let offset = self.getw()?; + Ok(usize::from(offset)) + } else { + Ok(usize::from(self.ch)) + } + } +} + +pub impl<'a> Iterator for StoredPath { + type Item = &[char; libc::PATH_MAX]; + + fn next(&mut self) -> Option<&Self::Item> { + let offset = self.get_offset(); + let position = 0; + loop { + self.ch = self.db.next()?; + if self.ch <= RECORD_SEPARATOR { + break; + } + if self.ch < PARITY { + self.path[self.last + position] = ch; + position += 1; + } else { + let bg = self.ch & PARITY - 1; + self.path[self.last + position] = self.squozen.bigrams[bg * 2]; + self.path[self.last + position + 1] = self.squozen.bigrams[bg * 2 + 1]; + position += 2; + } + } + Some(&self.path) + } +} diff --git a/docs/NOTES.md b/docs/NOTES.md new file mode 100644 index 0000000..50a1f28 --- /dev/null +++ b/docs/NOTES.md @@ -0,0 +1,3 @@ +Errors found: + +- Unparsable/unusable search pattern