Moved everything around so it's more project-y

Added the squozen patprep function, added unit tests to the
patprep `c` code, and ensured that the rust version works the
same way.  The only remaining code slowdown is that re-allocating
the Vec 50 million times turns out to be slower than re-using the
same slice of RAM over and over and over.
This commit is contained in:
Elf M. Sternberg 2022-11-11 08:31:22 -08:00
parent bf2b2715d4
commit 2eab17934c
19 changed files with 346 additions and 57 deletions

9
.gitignore vendored
View File

@ -1,2 +1,9 @@
/target
.#*
*~
*#
*.aux
**/*.rs.bk
target
Cargo.lock
.ccls-cache

View File

@ -1,17 +1,17 @@
[package]
name = "mlocate-rs"
version = "0.1.0"
name = "rlocate"
version = "0.0.1"
authors = ["Elf M. Sternberg <elf.sternberg@gmail.com>"]
edition = "2018"
license = "MPL-2.0+"
description = "Rust implementation of the Linux mlocate client, with library."
repository = "https://github.com/elfsternberg/mlocate-rs"
readme = "./README.md"
description = "Unix Locate/Updatedb utility"
license = "Apache-2.0 WITH LLVM-exception"
categories = ["coreutil"]
keywords = ["unix", "utility", "cli"]
repository = "https://git.elfsternberg.com/elf/rlocate"
readme = "README.md"
# default-run = "locate"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
globset = "0.4.8"
regex = "1.5.4"
clap = "2.33.3"
structview = "1.1.0"

View File

@ -25,8 +25,6 @@ Right now, none of this works. The _intended_ feature list is:
- Can read the following locatedb formats:
- MLOCATE
- LOCATE01
- LOCATE02
- Provides new locatedb formats:
- RLOCR01: Directory path prefixes are built by reference, making for
a much smaller database.

17
crates/mlocate/Cargo.toml Normal file
View File

@ -0,0 +1,17 @@
[package]
name = "mlocate-rs"
version = "0.1.0"
authors = ["Elf M. Sternberg <elf.sternberg@gmail.com>"]
edition = "2018"
license = "MPL-2.0+"
description = "Rust implementation of the Linux mlocate client, with library."
repository = "https://github.com/elfsternberg/mlocate-rs"
readme = "./README.md"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
globset = "0.4.8"
regex = "1.5.4"
clap = "2.33.3"
structview = "1.1.0"

View File

@ -3,4 +3,4 @@
* Implement a "write to rslocate01" feature.
* Implement a "write to rzlocate01" feature, stealing wildly from
dictd.
* Read up on fanotify for Linux. See what's rustable.

4
crates/mlocate/pop.yaml Normal file
View File

@ -0,0 +1,4 @@
---
name: rlocate
description: A version of the locate toolkit, written in rust
keywords: rust, library, suite

50
crates/mlocate/src/lib.rs Normal file
View File

@ -0,0 +1,50 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
//! MLocate
//!
//! The readme has the full explanation, but the `locate` suite of
//! tools present in all Linux distributions is used to locate files
//! on your storage device. Rather than search the device directly,
//! `locate` scans a catalog file created during downtime.
//!
//! `MLocate` is the most popular implementation of the locate system,
//! but it has three annoying flaws:
//!
//! 1. The archive file isn't very compressed.
//! 2. The archive file is always an average of 12 hours out of date.
//! 3. The archive is accessible only through a command line program.
//!
//! This program intends to read one of two different formats, the
//! classic mlocate format, or a new format that exploits a few nifty
//! tricks to try and make the database file smaller and access
//! faster.
extern crate structview;
pub mod database;
pub mod mlocate_db;
use crate::database::LocateDb;
use crate::mlocate_db::MlHeader;
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::{BufRead, BufReader};

    /// Opens the mlocate database at its fixed system path and checks that
    /// the first buffered chunk is accepted by `MlHeader::is`.
    ///
    /// Panics if the file cannot be opened; returns `Err` if the buffer
    /// cannot be filled.
    #[test]
    fn can_read_header() -> Result<(), String> {
        let file = File::open("/var/lib/mlocate/mlocate.db").expect("Unable to open file");
        let mut reader = BufReader::new(file);

        // Any I/O failure while filling the buffer is reported with one
        // fixed message; the underlying error is discarded.
        let buffer = reader
            .fill_buf()
            .map_err(|_| "The header could not be read".to_owned())?;

        assert!(MlHeader::is(buffer), "Could not read DB");
        Ok(())
    }
}

17
crates/squozen/Cargo.toml Normal file
View File

@ -0,0 +1,17 @@
[package]
name = "squozen"
description = "Decompressor and search engine for the Squozen format (1983)"
version = "0.1.0"
edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[lib]
name = "squozen"
path = "src/lib.rs"
[dependencies]
fnmatch-sys = "1.0.0"
[[bin]]
name = "bench_patprep"
path = "bench/bench_patprep.rs"

View File

@ -0,0 +1,11 @@
use squozen::prepare_pattern::prepare_pattern;
const COUNT: usize = 5 * 1000 * 1000 * 100;
/// Benchmark driver: runs `prepare_pattern` on one fixed pattern `COUNT`
/// times, discarding each result.
fn main() {
    for _ in 0..COUNT {
        let _g = prepare_pattern(b"/foo/bar/whatever[0-9]*");
    }
}

View File

@ -0,0 +1,6 @@
bench:
gcc -o bench_patprep patprep.c bench_patprep.c
rust_examples:
gcc -o rust_examples patprep.c gen_rust_examples.c

View File

@ -0,0 +1,9 @@
#include "patprep.h"
// Number of patprep() calls to time.
//
// The previous value, 5 * 1000 * 1000 * 1000, does not fit in a 32-bit `int`;
// evaluating it is signed overflow, so the real iteration count was whatever
// the compiler happened to produce. This value fits in `int` and matches
// COUNT in the Rust benchmark (bench_patprep.rs), so the two programs do the
// same amount of work.
const int count = 5 * 1000 * 1000 * 100;

// Benchmark driver: calls patprep() on one fixed pattern exactly `count`
// times and ignores the result.
int main(void) {
  for (int i = 0; i < count; i++) {
    patprep("/foo/bar/whatever[0-9]*");
  }
  return 0;
}

View File

@ -0,0 +1,36 @@
#include "patprep.h"
#include <stdio.h>
// Test patterns that were not present in the original sources. This program
// feeds each one to patprep() and prints a matching `assert_eq!` line, so the
// output can be pasted into the Rust version of the project to check that it
// behaves the same way.
//
// The list ends with a NULL sentinel, which is what stops the loop in main().
// Note that "/foo/bar/whatever[0-9]" is listed twice.
char *cases[] = {"testing",
"test*",
"/foo/bar/whatever[0-9]",
"/foo/bar/whatever*[0-9]",
"/foo/bar/whatever[0-9]",
"/foo/bar/whatever[0-9]*",
"/foo/bar/*whatever[0-9]",
"fooz]",
NULL};
// patprep() returns a pointer to the END of the extracted literal, so we have
// to search backwards for its beginning. The literal is NUL-delimited at both
// ends, because Unix.
//
// There is a ton of undefined behavior lurking here, all of which is
// predicated on never sending patprep() something it can't parse, or a
// zero-length string.
//
// Prints one `assert_eq!(prepare_pattern(b"..."), b"...");` line per entry in
// `cases` and returns 0.
int main(void) {
  for (int i = 0; cases[i] != NULL; ++i) {
    char *pattern = cases[i];
    char *g = patprep(pattern);

    // Walk back to the leading NUL, then step forward onto the first byte of
    // the literal. If patprep() copied nothing, g already sits on the leading
    // NUL and this yields an empty string.
    while (*g != 0) {
      g--;
    }
    g++;

    printf("assert_eq!(prepare_pattern(b\"%s\"), b\"%s\");\n", pattern, g);
  }
  return 0;
}

View File

@ -0,0 +1,74 @@
#include "patprep.h"
#include <stdio.h>
#include <string.h>
#include <strings.h>
// This is a copy of the patprep() function from the original 1983 version of
// locate.c. I have annotated it heavily in order to document, in my own mind at
// any rate, exactly what it does, step by step.
//
// Two deliberate departures from the 1983 text:
//   * the K&R-style definition is written as a prototype, matching patprep.h;
//   * the copy loop stops one byte earlier, so the terminating NUL always
//     lands inside `globfree` (the original could write one byte past it).

// Scratch buffer for the extracted literal. Byte 0 is a leading NUL and the
// final byte is reserved for the trailing NUL, so at most
// sizeof(globfree) - 2 = 98 literal bytes are kept; longer runs are truncated.
static char globfree[100];

// Extracts from the glob pattern `name` the run of literal characters that
// precedes its trailing metacharacters (and one trailing [] range).
//
// Returns a pointer to the LAST byte of that literal inside the static buffer
// `globfree`; the literal is NUL-delimited on both sides. If nothing was
// copied, the returned pointer is the leading NUL itself. Because the buffer
// is static, each call overwrites the previous result.
//
// `name` must be non-empty: an empty string makes `p` point before the start
// of the string.
char *patprep(char *name) {
  register char *endmark, *p, *subp;

  subp = globfree;
  *subp++ = '\0';

  // Go to the very end of the string passed in.
  p = name + strlen(name) - 1;

  /* starting from the END of the string, skip trailing metacharacters (and
     [] ranges) */
  for (; p >= name; p--)
    // index() is mis-named; it returns a pointer to the first instance of
    // the character `*p` in the literal "*?", or NULL when there is none.
    // So: stop at the first character (from the end) that is not * or ?.
    if (index("*?", *p) == 0)
      break;
  if (p < name)
    p = name;

  // Skip past a range operator: walk back to the matching '[' and step one
  // character before it.
  if (*p == ']')
    for (p--; p >= name; p--)
      if (*p == '[') {
        p--;
        break;
      }
  if (p < name)
    p = name;

  /*
   * if pattern has only metacharacters, check every path (force '/'
   * search)
   */
  // We got to the start of the string. At least give it an anchoring '/',
  // so the matcher has something it can make sense of.
  if ((p == name) && index("?*[]", *p) != 0)
    *subp++ = '/';
  else {
    // Okay, from where we were, scan backwards until we find another
    // metacharacter or the root of the search string, in which case we have
    // a literal substring.
    for (endmark = p; p >= name; p--)
      if (index("]*?", *p) != 0)
        break;

    // From the first non-metacharacter found up to endmark, copy that
    // substring into the reserve. The "- 1" keeps the last byte of the
    // buffer free for the NUL written below.
    for (++p; (p <= endmark) && subp < (globfree + sizeof(globfree) - 1);)
      *subp++ = *p++;
  }

  // Give a null ending.
  *subp = '\0';

  // Return a pointer to the last byte of the pattern reserve.
  return (--subp);
}

View File

@ -0,0 +1 @@
char *patprep(char *name);

View File

@ -0,0 +1,3 @@
// pub use crate::codesquoze;
// pub mod squozen;
pub mod prepare_pattern;

View File

@ -0,0 +1,92 @@
/// Every byte that is treated as glob syntax when deciding whether a pattern
/// consists of nothing but metacharacters.
const GLOBCHARS: &[u8] = b"?*[]";

/// Bytes that end the backwards scan for the start of the literal run.
const GLOBSTARTS: &[u8] = b"?*]";

/// Scans `name` backwards from index `end` down to index 1 and returns the
/// first index whose byte satisfies `comp`.
///
/// Index 0 is never tested. When no byte in `1..=end` matches, `alt` is
/// returned instead.
///
/// # Panics
///
/// Panics if `end` is non-zero and not a valid index into `name`.
fn hunt<F>(name: &[u8], end: usize, alt: usize, comp: F) -> usize
where
    F: Fn(&u8) -> bool,
{
    (1..=end).rev().find(|&p| comp(&name[p])).unwrap_or(alt)
}

/// Finds the first substring of characters, starting from the end of the
/// search string, that does not contain glob-special characters, and returns
/// a vector of those characters for comparison.
///
/// Trailing `*` and `?` are skipped, then a single trailing `[...]` range,
/// and the literal run ending there is returned. When no usable literal is
/// found the result is `b"/"`, which forces a search of every path.
///
/// The test cases have all been derived from tests performed on the original
/// 1983 `patprep` function found in locate.c. Unlike the original, this
/// function assumes the slice contains only the pattern, with no nulls at
/// either end, relying on Rust tracking the size of slices internally.
///
/// # Panics
///
/// Panics if `name` is empty.
pub fn prepare_pattern(name: &[u8]) -> Vec<u8> {
    if name.is_empty() {
        panic!("Library error - This function should never be called with an empty string.")
    }

    // From here on `eol` is the index of the last byte we want to keep, not
    // the index one past it.
    let mut eol = hunt(name, name.len() - 1, 0, |&c| c != b'*' && c != b'?');

    // Skip a trailing range. The `eol > 0` guard matters: when the pattern's
    // first byte is `]` (for example `b"]"` or `b"]*"`), there is nothing in
    // front of it to scan, and `eol - 1` would underflow.
    if eol > 0 && name[eol] == b']' {
        eol = hunt(name, eol - 1, 0, |&c| c == b'[').saturating_sub(1);
    }

    // We backed all the way up to the first byte: it is either the entire
    // literal or a metacharacter, in which case fall back to "/".
    if eol == 0 {
        let first = name[0];
        return if GLOBCHARS.contains(&first) {
            vec![b'/']
        } else {
            vec![first]
        };
    }

    // Find where the literal run begins. `hunt` never inspects index 0, so
    // the byte at the returned index is re-checked to decide whether it is a
    // boundary metacharacter that must be excluded from the result.
    let boundary = hunt(name, eol, 0, |c| GLOBSTARTS.contains(c));
    let start = if GLOBSTARTS.contains(&name[boundary]) {
        boundary + 1
    } else {
        boundary
    };

    if start > eol {
        // The byte at `eol` is itself a metacharacter: no literal to return.
        vec![b'/']
    } else {
        name[start..=eol].to_vec()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Each pair is a glob pattern and the literal that `prepare_pattern`
    /// must reduce it to.
    const CASES: &[(&str, &str)] = &[
        ("testing", "testing"),
        ("t", "t"),
        ("test*", "test"),
        ("test*", "test"),
        ("/foo/bar/whatever[0-9]", "/foo/bar/whatever"),
        ("/foo/bar/whatever*[0-9]", "/"),
        ("/foo/bar/whatever[0-9]", "/foo/bar/whatever"),
        ("/foo/bar/whatever[0-9]*", "/foo/bar/whatever"),
        ("/foo/bar/*whatever[0-9]", "whatever"),
        ("fooz]", "f"),
    ];

    #[test]
    fn test_patterns() {
        for &(pattern, expected) in CASES {
            assert_eq!(prepare_pattern(pattern.as_bytes()), expected.as_bytes());
        }
    }
}

View File

@ -1,50 +1,14 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
//! MLocate
//!
//! The readme has the full explanation, but the `locate` suite of
//! tools present in all Linux distributions is used to locate files
//! on your storage device. Rather than search the device directly,
//! `locate` scans a catalog file created during downtime.
//!
//! `MLocate` is the most popular implementation of the locate system,
//! but it has three annoying flaws:
//!
//! 1. The archive file isn't very compressed.
//! 2. The archive file is always an average of 12 hours out of date.
//! 3. The archive is accessible only through a command line program.
//!
//! This program intends to read one of two different formats, the
//! classic mlocate format, or a new format that exploits a few nifty
//! tricks to try and make the database file smaller and access
//! faster.
extern crate structview;
pub mod database;
pub mod mlocate_db;
use crate::database::LocateDb;
use crate::mlocate_db::MlHeader;
/// Returns the sum of `left` and `right`.
pub fn add(left: usize, right: usize) -> usize {
left + right
}
#[cfg(test)]
mod tests {
use super::*;
use std::fs::File;
use std::io::{BufRead, BufReader};
#[test]
fn can_read_header() -> Result<(), String> {
let db = File::open("/var/lib/mlocate/mlocate.db").expect("Unable to open file");
let mut reader = BufReader::new(db);
match reader.fill_buf() {
Ok(buffer) => {
assert!(MlHeader::is(buffer), "Could not read DB");
Ok(())
}
Err(_) => Err("The header could not be read".to_owned()),
}
fn it_works() {
let result = add(2, 2);
assert_eq!(result, 4);
}
}