Intermediate progress: Squozen
I realized that the C version of this thing does several things in the same function: it loads the bigrams, it iterates through the database, and it compares the entries found in the database to the prepared pattern. It seems to me, therefore, that we're better off with an instance that loads the bigrams and then closes the database immediately. Later, the client can ask for one of two iterators: one that returns every entry in sequence, or one that returns, in sequence, only the entries that match the pattern passed in.
This commit is contained in:
parent
d13a76f08a
commit
550d4c1876
|
@ -5,5 +5,6 @@
|
||||||
**/*.rs.bk
|
**/*.rs.bk
|
||||||
target
|
target
|
||||||
Cargo.lock
|
Cargo.lock
|
||||||
|
crates/squozen/docs/patprep/bench_patprep
|
||||||
|
crates/squozen/docs/patprep/rust_examples
|
||||||
.ccls-cache
|
.ccls-cache
|
||||||
|
|
|
@ -14,4 +14,5 @@ edition = "2021"
|
||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
|
libc = "0.2.137"
|
||||||
|
|
||||||
|
|
|
@ -11,6 +11,7 @@ path = "src/lib.rs"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
fnmatch-sys = "1.0.0"
|
fnmatch-sys = "1.0.0"
|
||||||
|
libc = "0.2.137"
|
||||||
|
|
||||||
[[bin]]
|
[[bin]]
|
||||||
name = "bench_patprep"
|
name = "bench_patprep"
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
// pub use crate::codesquoze;
|
// pub use crate::codesquoze;
|
||||||
// pub mod squozen;
|
extern crate libc;
|
||||||
|
|
||||||
pub mod prepare_pattern;
|
pub mod prepare_pattern;
|
||||||
|
pub mod squozen;
|
||||||
|
|
|
@ -13,6 +13,13 @@ const GLOBSTARTS: &[u8] = &[b'?', b'*', b']'];
|
||||||
// only the content of the pattern, and no nulls at either end, relying instead
|
// only the content of the pattern, and no nulls at either end, relying instead
|
||||||
// on Rust's tracking the size of slices internally.
|
// on Rust's tracking the size of slices internally.
|
||||||
|
|
||||||
|
// One of the biggest changes between this and the original is that the original
|
||||||
|
// used "not pointing within the legal slice" as a sentinel for exceeding the
|
||||||
|
// bounds of the searchable space; since Rust doesn't allow that, we have to
|
||||||
|
// test ahead of time if the condition in which the original would have exceeded
|
||||||
|
// the legal search space is met, and short-circuit the return value at that
|
||||||
|
// point.
|
||||||
|
|
||||||
fn hunt<F>(name: &[u8], end: usize, alt: usize, comp: F) -> usize
|
fn hunt<F>(name: &[u8], end: usize, alt: usize, comp: F) -> usize
|
||||||
where
|
where
|
||||||
F: Fn(&u8) -> bool,
|
F: Fn(&u8) -> bool,
|
||||||
|
|
|
@ -0,0 +1,106 @@
|
||||||
|
use libc;
use std::fs::File;
use std::io::{self, BufReader, Bytes, Read, Seek, SeekFrom};
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
|
/// Bytes with this bit set are bigram references; bytes below it are literal
/// path characters.
const PARITY: u8 = 0o200;

/// Any byte less than or equal to this value ends the current path and starts
/// the next record. The value itself announces that the record's offset is
/// stored as a separate word instead of in this byte.
const RECORD_SEPARATOR: u8 = 30;

/// Bias subtracted from every stored offset to recover the signed change in
/// shared-prefix length between consecutive paths.
const OFFSET: u8 = 14;

/// Size in bytes of the bigram table at the head of the database: 128 bigrams
/// (the range of `byte & (PARITY - 1)`) of two bytes each, stored interleaved.
const BIGRAMS: usize = 256;

/// A handle on a squozen (bigram + front-compressed) path database.
///
/// Creating one reads the bigram table and then closes the file again; the
/// database is reopened by each iterator that is requested later.
#[derive(Clone, Debug)]
pub struct Squozen {
    /// Interleaved bigram table: bigram `n` is `bigrams[2 * n]`, `bigrams[2 * n + 1]`.
    bigrams: [u8; BIGRAMS],
    /// Location of the database, kept so iterators can reopen it.
    path: PathBuf,
}

impl Squozen {
    /// Opens `filename`, loads its bigram table, and closes the file.
    ///
    /// # Errors
    ///
    /// Returns the underlying I/O error if the file cannot be opened or is
    /// shorter than the bigram table.
    pub fn new(filename: impl AsRef<Path>) -> io::Result<Squozen> {
        let path = filename.as_ref().to_path_buf();
        let mut bigrams = [0u8; BIGRAMS];
        // The temporary `File` is dropped (closed) at the end of this statement.
        File::open(&path)?.read_exact(&mut bigrams)?;
        Ok(Squozen { bigrams, path })
    }

    /// Returns an iterator over every path stored in the database, in order.
    ///
    /// # Errors
    ///
    /// Returns the underlying I/O error if the database cannot be reopened.
    pub fn paths(&self) -> io::Result<StoredPath<'_>> {
        StoredPath::new(self)
    }

    // TODO: `iter()`, returning a `FoundPath` iterator that yields only the
    // entries matching a prepared pattern.
}

/// Iterator that decodes the paths of a [`Squozen`] database one at a time.
#[derive(Debug)]
pub struct StoredPath<'a> {
    /// Database handle supplying the bigram table.
    squozen: &'a Squozen,
    /// The most recently decoded path; the next path overlays its tail.
    path: Vec<u8>,
    /// Byte stream positioned just after the bigram table.
    db: Bytes<BufReader<File>>,
    /// The byte that opens the next record, or `None` once the stream is spent.
    ch: Option<u8>,
    /// Length of the prefix the current path shares with the previous one.
    last: usize,
}

impl<'a> StoredPath<'a> {
    /// Reopens the database behind `squozen` and positions the stream on the
    /// first record.
    ///
    /// # Errors
    ///
    /// Returns the underlying I/O error if the file cannot be opened, seeked,
    /// or read.
    pub fn new(squozen: &'a Squozen) -> io::Result<StoredPath<'a>> {
        let mut reader = BufReader::new(File::open(&squozen.path)?);
        // Skip the bigram table; `squozen` already holds a copy of it.
        reader.seek(SeekFrom::Start(BIGRAMS as u64))?;
        let mut db = reader.bytes();
        // Prime the stream: either the first record's offset byte or end of file.
        let ch = db.next().transpose()?;
        Ok(StoredPath {
            squozen,
            path: Vec::new(),
            db,
            ch,
            last: 0,
        })
    }

    /// Reads one mandatory byte, treating end of file as an error.
    fn require_byte(&mut self) -> io::Result<u8> {
        match self.db.next() {
            Some(byte) => byte,
            None => Err(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "squozen database ends inside an offset word",
            )),
        }
    }

    /// Reads the next two bytes of the stream as a little-endian word.
    ///
    /// NOTE(review): assumes the database writer stores the long offset as
    /// two little-endian bytes; confirm against the writer.
    ///
    /// # Errors
    ///
    /// Returns `UnexpectedEof` if fewer than two bytes remain, or the
    /// underlying I/O error.
    pub fn getw(&mut self) -> io::Result<u16> {
        let low = self.require_byte()?;
        let high = self.require_byte()?;
        Ok(u16::from_le_bytes([low, high]))
    }

    /// Returns the raw (still biased by `OFFSET`) offset of the record that
    /// the current byte opens.
    ///
    /// If the current byte is `RECORD_SEPARATOR` the offset is read from the
    /// following word; otherwise the byte itself is the offset.
    ///
    /// # Errors
    ///
    /// Returns `UnexpectedEof` if the stream is already exhausted, or any
    /// error from [`StoredPath::getw`].
    pub fn get_offset(&mut self) -> io::Result<usize> {
        match self.ch {
            Some(RECORD_SEPARATOR) => self.getw().map(usize::from),
            Some(byte) => Ok(usize::from(byte)),
            None => Err(io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "no record left in squozen database",
            )),
        }
    }

    /// Decodes the record opened by `self.ch` and leaves `self.ch` holding the
    /// byte that opens the following record (or `None` at end of file).
    fn decode_next(&mut self) -> io::Result<Vec<u8>> {
        let stored = self.get_offset()?;
        // The new shared-prefix length is the old one plus a signed delta,
        // stored on disk as `delta + OFFSET`. It can never be negative, nor
        // longer than the path it is a prefix of.
        let shared = (self.last + stored)
            .checked_sub(usize::from(OFFSET))
            .filter(|&len| len <= self.path.len())
            .ok_or_else(|| {
                io::Error::new(
                    io::ErrorKind::InvalidData,
                    "corrupted squozen database: impossible prefix length",
                )
            })?;
        self.last = shared;
        self.path.truncate(shared);

        loop {
            match self.db.next().transpose()? {
                Some(byte) if byte > RECORD_SEPARATOR => {
                    if byte < PARITY {
                        self.path.push(byte);
                    } else {
                        // Bigrams are parity-marked: strip the mark to get the index.
                        let bg = usize::from(byte & (PARITY - 1));
                        self.path
                            .extend_from_slice(&self.squozen.bigrams[bg * 2..bg * 2 + 2]);
                    }
                }
                // Either the next record's offset byte or end of file.
                opener => {
                    self.ch = opener;
                    break;
                }
            }
        }
        Ok(self.path.clone())
    }
}

impl<'a> Iterator for StoredPath<'a> {
    /// The full decoded path as raw bytes, or the error that stopped decoding.
    type Item = io::Result<Vec<u8>>;

    /// Yields the next stored path. After end of file or the first error the
    /// iterator yields `None` forever.
    fn next(&mut self) -> Option<Self::Item> {
        // `None` here means the stream is spent.
        self.ch?;
        let decoded = self.decode_next();
        if decoded.is_err() {
            // Do not try to resynchronise on a damaged stream.
            self.ch = None;
        }
        Some(decoded)
    }
}
|
|
@ -0,0 +1,3 @@
|
||||||
|
Errors found:
|
||||||
|
|
||||||
|
- Unparsable/unusable search pattern
|
Loading…
Reference in New Issue