Intermediate progress: Squozen

I realized that the C version of this thing does multiple things
in the same function: it loads the bigrams, it iterates through
the database, and it compares the things found in the database
to the prepared pattern.  It seems to me, therefore, that we're
better off with an instance that loads the bigrams, then closes
the database immediately.

Later, the client can ask for one of two iterators: one that returns
every entry in sequence, or one that returns only the entries that
match the pattern passed in.
This commit is contained in:
Elf M. Sternberg 2022-11-26 16:49:25 -08:00
parent d13a76f08a
commit 550d4c1876
7 changed files with 123 additions and 2 deletions

3
.gitignore vendored
View File

@ -5,5 +5,6 @@
**/*.rs.bk
target
Cargo.lock
crates/squozen/docs/patprep/bench_patprep
crates/squozen/docs/patprep/rust_examples
.ccls-cache

View File

@ -14,4 +14,5 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
libc = "0.2.137"

View File

@ -11,6 +11,7 @@ path = "src/lib.rs"
[dependencies]
fnmatch-sys = "1.0.0"
libc = "0.2.137"
[[bin]]
name = "bench_patprep"

View File

@ -1,3 +1,5 @@
// pub use crate::codesquoze;
// pub mod squozen;
extern crate libc;
pub mod prepare_pattern;
pub mod squozen;

View File

@ -13,6 +13,13 @@ const GLOBSTARTS: &[u8] = &[b'?', b'*', b']'];
// only the content of the pattern, and no nulls at either end, relying instead
// on Rust's tracking the size of slices internally.
// One of the biggest changes between this and the original is that the original
// used "not pointing within the legal slice" as a sentinel for exceeding the
// bounds of the searchable space; since Rust doesn't allow that, we have to
// test ahead of time whether the original would have stepped outside the
// legal search space, and short-circuit the return value at that point.
fn hunt<F>(name: &[u8], end: usize, alt: usize, comp: F) -> usize
where
F: Fn(&u8) -> bool,

View File

@ -0,0 +1,106 @@
use libc;
use std::fs::File;
use std::io::{self, BufReader, Bytes, Read, Seek, SeekFrom};
use std::path::{Path, PathBuf};
/// High bit that marks a database byte as a bigram index rather than a literal.
const PARITY: u8 = 0o200;
/// Count byte announcing that a two-byte count follows. Any byte at or below
/// this value also terminates the entry currently being decoded.
const RECORD_SEPARATOR: u8 = 30;
/// Bias added to every stored prefix-length delta so the stored value is never
/// negative; it is subtracted again while decoding.
const OFFSET: u8 = 14;
/// Size in bytes of the bigram table at the start of the database: one
/// two-byte pair for each of the 128 indexes that fit in the low seven bits.
const BIGRAMS: usize = 256;

/// A squozen database: its bigram table plus the location of the file.
///
/// Only the bigram table is kept in memory. Each iterator obtained from
/// [`Squozen::paths`] reopens the file to read the entries.
#[derive(Clone, Debug)]
pub struct Squozen {
    bigrams: [u8; BIGRAMS],
    path: PathBuf,
}

impl Squozen {
    /// Loads the bigram table from `filename` and closes the file again.
    ///
    /// # Errors
    ///
    /// Returns any I/O error raised while opening the file, including
    /// `UnexpectedEof` when the file is shorter than the bigram table.
    pub fn new(filename: impl AsRef<Path>) -> io::Result<Squozen> {
        let path = filename.as_ref().to_path_buf();
        let mut bigrams = [0u8; BIGRAMS];
        // The `File` is a temporary, so it is closed as soon as the table is read.
        File::open(&path)?.read_exact(&mut bigrams)?;
        Ok(Squozen { bigrams, path })
    }

    /// Returns an iterator over every path stored in the database, in order.
    ///
    /// # Errors
    ///
    /// Returns any I/O error raised while reopening the database.
    pub fn paths(&self) -> io::Result<StoredPath<'_>> {
        StoredPath::new(self)
    }

    // TODO: a second iterator that yields only the entries matching a
    // prepared pattern.
}

/// Iterator over the paths of a [`Squozen`] database.
///
/// Each item is a freshly allocated copy of the decoded path, as raw bytes,
/// because an `Iterator` cannot lend out a buffer it keeps mutating.
#[derive(Debug)]
pub struct StoredPath<'a> {
    squozen: &'a Squozen,
    /// Most recently decoded path; the next entry reuses a prefix of it.
    path: Vec<u8>,
    db: Bytes<BufReader<File>>,
    /// Count byte that opens the next entry, or `None` once the file is exhausted.
    ch: Option<u8>,
    /// Number of leading bytes the current entry shares with its predecessor.
    last: usize,
}

impl<'a> StoredPath<'a> {
    /// Reopens the database behind `squozen` and positions the reader on the
    /// count byte of the first entry, just past the bigram table.
    ///
    /// # Errors
    ///
    /// Returns any I/O error raised while opening, seeking or reading.
    pub fn new(squozen: &'a Squozen) -> io::Result<StoredPath<'a>> {
        let mut reader = BufReader::new(File::open(&squozen.path)?);
        // BIGRAMS is 256, so widening it to u64 cannot lose information.
        reader.seek(SeekFrom::Start(BIGRAMS as u64))?;
        let mut db = reader.bytes();
        let ch = db.next().transpose()?;
        Ok(StoredPath {
            squozen,
            path: Vec::new(),
            db,
            ch,
            last: 0,
        })
    }

    /// Reads one byte; `Ok(None)` signals end of file.
    fn read_byte(&mut self) -> io::Result<Option<u8>> {
        self.db.next().transpose()
    }

    /// Reads a two-byte little-endian count.
    ///
    /// NOTE(review): C's `getw` reads a native `int`; confirm which width and
    /// byte order the database writer actually emits.
    ///
    /// # Errors
    ///
    /// Returns `UnexpectedEof` when the file ends inside the count.
    pub fn getw(&mut self) -> io::Result<u16> {
        let mut word = [0u8; 2];
        for slot in &mut word {
            *slot = self
                .read_byte()?
                .ok_or_else(|| io::Error::from(io::ErrorKind::UnexpectedEof))?;
        }
        Ok(u16::from_le_bytes(word))
    }

    /// Returns the still-biased count that opens the pending entry: the
    /// pending byte itself, or the following two-byte word when that byte is
    /// the record separator.
    ///
    /// # Errors
    ///
    /// Returns `UnexpectedEof` when no entry is pending or the long count is
    /// cut short.
    pub fn get_offset(&mut self) -> io::Result<usize> {
        match self.ch {
            Some(RECORD_SEPARATOR) => self.getw().map(usize::from),
            Some(count) => Ok(usize::from(count)),
            None => Err(io::Error::from(io::ErrorKind::UnexpectedEof)),
        }
    }

    /// Decodes the pending entry on top of the previous path and returns a copy.
    fn decode_entry(&mut self) -> io::Result<Vec<u8>> {
        let raw = self.get_offset()?;
        // The stored count is a biased delta against the previous prefix
        // length. A result below zero or beyond the previous path means the
        // database is corrupt.
        let shared = self
            .last
            .checked_add(raw)
            .and_then(|biased| biased.checked_sub(usize::from(OFFSET)))
            .filter(|&keep| keep <= self.path.len())
            .ok_or_else(|| {
                io::Error::new(
                    io::ErrorKind::InvalidData,
                    "squozen entry reuses more of the previous path than exists",
                )
            })?;
        self.path.truncate(shared);
        self.last = shared;

        loop {
            match self.read_byte()? {
                // End of file still completes the entry being decoded.
                None => {
                    self.ch = None;
                    break;
                }
                // A byte at or below the separator is the next entry's count.
                Some(byte) if byte <= RECORD_SEPARATOR => {
                    self.ch = Some(byte);
                    break;
                }
                Some(byte) if byte < PARITY => self.path.push(byte),
                Some(byte) => {
                    // The low seven bits select a two-byte pair in the table.
                    let pair = usize::from(byte & (PARITY - 1)) * 2;
                    self.path
                        .extend_from_slice(&self.squozen.bigrams[pair..pair + 2]);
                }
            }
        }
        Ok(self.path.clone())
    }
}

impl Iterator for StoredPath<'_> {
    type Item = io::Result<Vec<u8>>;

    /// Yields the next stored path, or the error that stopped decoding.
    /// Iteration ends after the last entry or after the first error.
    fn next(&mut self) -> Option<Self::Item> {
        // No pending count byte means the previous entry ran to end of file.
        self.ch?;
        let entry = self.decode_entry();
        if entry.is_err() {
            self.ch = None;
        }
        Some(entry)
    }
}

3
docs/NOTES.md Normal file
View File

@ -0,0 +1,3 @@
Errors found:
- Unparsable/unusable search pattern