Moved everything around so it's more project-y
Added the squozen patprep function, added unit tests to the patprep C code, and ensured that the Rust version works the same way. The only remaining code slowdown is that re-allocating the Vec 50 million times turns out to be slower than re-using the same slice of RAM over and over and over.
This commit is contained in:
parent
bf2b2715d4
commit
2eab17934c
|
@ -1,2 +1,9 @@
|
||||||
/target
|
.#*
|
||||||
|
*~
|
||||||
|
*#
|
||||||
|
*.aux
|
||||||
|
**/*.rs.bk
|
||||||
|
target
|
||||||
Cargo.lock
|
Cargo.lock
|
||||||
|
|
||||||
|
.ccls-cache
|
||||||
|
|
22
Cargo.toml
22
Cargo.toml
|
@ -1,17 +1,17 @@
|
||||||
[package]
|
[package]
|
||||||
name = "mlocate-rs"
|
name = "rlocate"
|
||||||
version = "0.1.0"
|
version = "0.0.1"
|
||||||
authors = ["Elf M. Sternberg <elf.sternberg@gmail.com>"]
|
authors = ["Elf M. Sternberg <elf.sternberg@gmail.com>"]
|
||||||
edition = "2018"
|
description = "Unix Locate/Updatedb utility"
|
||||||
license = "MPL-2.0+"
|
license = "Apache-2.0 WITH LLVM-exception"
|
||||||
description = "Rust implementation of the Linux mlocate client, with library."
|
categories = ["coreutil"]
|
||||||
repository = "https://github.com/elfsternberg/mlocate-rs"
|
keywords = ["unix", "utility", "cli"]
|
||||||
readme = "./README.md"
|
repository = "https://git.elfsternberg.com/elf/rlocate"
|
||||||
|
readme = "README.md"
|
||||||
|
# default-run = "locate"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
globset = "0.4.8"
|
|
||||||
regex = "1.5.4"
|
|
||||||
clap = "2.33.3"
|
|
||||||
structview = "1.1.0"
|
|
||||||
|
|
|
@ -25,8 +25,6 @@ Right now, none of this works. The _intended_ feature list is:
|
||||||
|
|
||||||
- Can read the following locatedb formats:
|
- Can read the following locatedb formats:
|
||||||
- MLOCATE
|
- MLOCATE
|
||||||
- LOCATE01
|
|
||||||
- LOCATE02
|
|
||||||
- Provides new locatedb formats:
|
- Provides new locatedb formats:
|
||||||
- RLOCR01: Directory path prefixes are built by reference, making for
|
- RLOCR01: Directory path prefixes are built by reference, making for
|
||||||
a much smaller database.
|
a much smaller database.
|
||||||
|
|
|
@ -0,0 +1,17 @@
|
||||||
|
[package]
|
||||||
|
name = "mlocate-rs"
|
||||||
|
version = "0.1.0"
|
||||||
|
authors = ["Elf M. Sternberg <elf.sternberg@gmail.com>"]
|
||||||
|
edition = "2018"
|
||||||
|
license = "MPL-2.0+"
|
||||||
|
description = "Rust implementation of the Linux mlocate client, with library."
|
||||||
|
repository = "https://github.com/elfsternberg/mlocate-rs"
|
||||||
|
readme = "./README.md"
|
||||||
|
|
||||||
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
globset = "0.4.8"
|
||||||
|
regex = "1.5.4"
|
||||||
|
clap = "2.33.3"
|
||||||
|
structview = "1.1.0"
|
|
@ -3,4 +3,4 @@
|
||||||
* Implement a "write to rslocate01" feature.
|
* Implement a "write to rslocate01" feature.
|
||||||
* implement a "write to rzlocate01" feature, stealing wildly from
|
* implement a "write to rzlocate01" feature, stealing wildly from
|
||||||
dictd.
|
dictd.
|
||||||
|
* Read up on fanotify for Linux. See what's rustable.
|
|
@ -0,0 +1,4 @@
|
||||||
|
---
|
||||||
|
name: rlocate
|
||||||
|
description: A version of the locate toolkit, written in rust
|
||||||
|
keywords: rust, library, suite
|
|
@ -0,0 +1,50 @@
|
||||||
|
// This Source Code Form is subject to the terms of the Mozilla Public
|
||||||
|
// License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||||
|
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||||
|
|
||||||
|
//! MLocate
|
||||||
|
//!
|
||||||
|
//! The readme has the full explanation, but the `locate` suite of
|
||||||
|
//! tools present in all Linux distributions is used to locate files
|
||||||
|
//! on your storage device. Rather than search the device directly,
|
||||||
|
//! `locate` scans a catalog file created during downtime.
|
||||||
|
//!
|
||||||
|
//! `MLocate` is the most popular implementation of the locate system,
|
||||||
|
//! but it has three annoying flaws:
|
||||||
|
//!
|
||||||
|
//! 1. The archive file isn't very compressed.
|
||||||
|
//! 2. The archive file is always an average of 12 hours out of date.
|
||||||
|
//! 3. The archive is accessible only through a command line program.
|
||||||
|
//!
|
||||||
|
//! This program intends to read one of two different formats, the
|
||||||
|
//! classic mlocate format, or a new format that exploits a few nifty
|
||||||
|
//! tricks to try and make the database file smaller and access
|
||||||
|
//! faster.
|
||||||
|
|
||||||
|
extern crate structview;
|
||||||
|
|
||||||
|
pub mod database;
|
||||||
|
pub mod mlocate_db;
|
||||||
|
|
||||||
|
use crate::database::LocateDb;
|
||||||
|
use crate::mlocate_db::MlHeader;
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::{BufRead, BufReader};

    /// Smoke test: opens the system mlocate database, fills the reader's
    /// buffer once, and checks that `MlHeader::is` accepts those bytes.
    ///
    /// Panics if the database file cannot be opened; returns `Err` if the
    /// buffered read itself fails.
    #[test]
    fn can_read_header() -> Result<(), String> {
        let db = File::open("/var/lib/mlocate/mlocate.db").expect("Unable to open file");
        let mut reader = BufReader::new(db);

        // Translate the I/O error into the test's String error and bail early.
        let buffer = reader
            .fill_buf()
            .map_err(|_| "The header could not be read".to_owned())?;

        assert!(MlHeader::is(buffer), "Could not read DB");
        Ok(())
    }
}
|
|
@ -0,0 +1,17 @@
|
||||||
|
[package]
|
||||||
|
name = "squozen"
|
||||||
|
description = "Decompressor and search engine for the Squozen format (1983)"
|
||||||
|
version = "0.1.0"
|
||||||
|
edition = "2021"
|
||||||
|
|
||||||
|
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||||
|
[lib]
|
||||||
|
name = "squozen"
|
||||||
|
path = "src/lib.rs"
|
||||||
|
|
||||||
|
[dependencies]
|
||||||
|
fnmatch-sys = "1.0.0"
|
||||||
|
|
||||||
|
[[bin]]
|
||||||
|
name = "bench_patprep"
|
||||||
|
path = "bench/bench_patprep.rs"
|
|
@ -0,0 +1,11 @@
|
||||||
|
use squozen::prepare_pattern::prepare_pattern;
|
||||||
|
|
||||||
|
const COUNT: usize = 5 * 1000 * 1000 * 100;
|
||||||
|
|
||||||
|
fn main() {
|
||||||
|
let mut end = COUNT;
|
||||||
|
while end > 0 {
|
||||||
|
let _g = prepare_pattern(b"/foo/bar/whatever[0-9]*");
|
||||||
|
end = end - 1;
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,6 @@
|
||||||
|
|
||||||
|
bench:
|
||||||
|
gcc -o bench_patprep patprep.c bench_patprep.c
|
||||||
|
|
||||||
|
rust_examples:
|
||||||
|
gcc -o rust_examples patprep.c gen_rust_examples.c
|
|
@ -0,0 +1,9 @@
|
||||||
|
#include "patprep.h"
|
||||||
|
|
||||||
|
// Number of benchmark iterations.
//
// The previous expression, 5 * 1000 * 1000 * 1000, evaluates to 5,000,000,000,
// which does not fit in a 32-bit int: signed overflow is undefined behavior
// and typically wraps to 705,032,704. This value fits in an int and matches
// COUNT in the Rust benchmark (bench_patprep.rs), so the two timings measure
// the same amount of work.
const int count = 5 * 1000 * 1000 * 100;

// Benchmark driver: runs patprep() exactly `count` times over a fixed pattern
// so the whole run can be timed from the outside. The result is ignored.
int main(void) {
  // `<` rather than `<=`: the Rust loop runs COUNT times, not COUNT + 1.
  for (int i = 0; i < count; i++) {
    patprep("/foo/bar/whatever[0-9]*");
  }
  return 0;
}
|
|
@ -0,0 +1,36 @@
|
||||||
|
#include "patprep.h"
|
||||||
|
#include <stdio.h>
|
||||||
|
|
||||||
|
// Below is a series of test cases that were not present in the original. The
|
||||||
|
// purpose of this function is to output the test cases that will go into the
|
||||||
|
// rust version of the project, to assert that they behave correctly.
|
||||||
|
|
||||||
|
char *cases[] = {"testing",
|
||||||
|
"test*",
|
||||||
|
"/foo/bar/whatever[0-9]",
|
||||||
|
"/foo/bar/whatever*[0-9]",
|
||||||
|
"/foo/bar/whatever[0-9]",
|
||||||
|
"/foo/bar/whatever[0-9]*",
|
||||||
|
"/foo/bar/*whatever[0-9]",
|
||||||
|
"fooz]",
|
||||||
|
NULL};
|
||||||
|
|
||||||
|
// Since patprep gives us the END of the array, we need to search for
|
||||||
|
// the beginning. And both ends are null terminated, because Unix.
|
||||||
|
// There is a ton of undefined behavior here, all of which is predicated
|
||||||
|
// on never sending patprep something it can't parse, or a zero-length
|
||||||
|
// string.
|
||||||
|
|
||||||
|
void main() {
|
||||||
|
int i;
|
||||||
|
char *c;
|
||||||
|
char *g;
|
||||||
|
for (i = 0, c = cases[0]; cases[i] != NULL; ++i, c = cases[i]) {
|
||||||
|
g = patprep(c);
|
||||||
|
while (*g != 0) {
|
||||||
|
g--;
|
||||||
|
}
|
||||||
|
g++;
|
||||||
|
printf("assert_eq!(prepare_pattern(b\"%s\"), b\"%s\");\n", c, g);
|
||||||
|
}
|
||||||
|
}
|
|
@ -0,0 +1,74 @@
|
||||||
|
#include "patprep.h"
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <string.h>
|
||||||
|
#include <strings.h>
|
||||||
|
|
||||||
|
// This is a copy of the patprep() function from the original 1983 version of
|
||||||
|
// locate.c. I have annotated it heavily in order to document, in my own mind at
|
||||||
|
// any rate, exactly what it does, step by step.
|
||||||
|
|
||||||
|
// Scratch buffer for the extracted literal. Layout: globfree[0] is a leading
// NUL sentinel, the literal starts at globfree[1], and a terminating NUL
// follows it. That leaves room for at most sizeof(globfree) - 2 = 98 bytes of
// pattern text.
static char globfree[100];

// patprep: pick out the last glob-free literal run of a search pattern.
//
// name:    NUL-terminated glob pattern. It is only read, never written.
// returns: a pointer to the LAST byte of the extracted literal inside the
//          static buffer `globfree`. The literal is NUL-terminated and is
//          also preceded by a NUL, so callers can scan it backwards. When the
//          pattern consists only of metacharacters the literal is "/". When
//          no bytes are copied at all, the returned pointer is the leading
//          NUL sentinel itself.
//
// Notes:
//  - The result lives in a static buffer: every call overwrites the previous
//    result, and the function is not reentrant.
//  - Literals longer than 98 bytes are truncated to their first 98 bytes.
//    The original bound let the copy loop fill globfree[99] and then wrote
//    the terminator to globfree[100], one byte past the array; the bound
//    below reserves the last slot for the terminator.
//  - The backward scans terminate by stepping `p` one below `name`, exactly
//    as the 1983 code did.
//  - The definition uses a prototype (matching patprep.h) instead of the
//    K&R-style parameter list, which newer C standards no longer accept.
char *patprep(char *name) {
  register char *endmark, *p, *subp;

  // Lay down the leading NUL sentinel; the literal is written after it.
  subp = globfree;
  *subp++ = '\0';

  // Go to the very end of the string passed in.
  p = name + strlen(name) - 1;

  /* starting from the END of the string, skip trailing metacharacters (and
     [] ranges) */
  for (; p >= name; p--)
    // index() is mis-named; it returns a pointer to the first instance of
    // the character '*p' in the static string here, or NULL when there is
    // none. So: stop at the first character (scanning backwards) that is
    // neither '*' nor '?'.
    if (index("*?", *p) == 0)
      break;

  // Everything was a metacharacter: clamp back onto the first byte.
  if (p < name)
    p = name;

  // Skip past a range operator: walk back to the matching '[' and step to the
  // byte before it. An unmatched ']' walks off the front of the string.
  if (*p == ']')
    for (p--; p >= name; p--)
      if (*p == '[') {
        p--;
        break;
      }

  if (p < name)
    p = name;

  /*
   * if pattern has only metacharacters, check every path (force '/'
   * search)
   */

  // We got to the start of the string. At least give it an anchoring '/',
  // so the matcher has something it can make sense of.
  if ((p == name) && index("?*[]", *p) != 0)
    *subp++ = '/';
  else {
    // Okay, from where we were, scan backwards until we find another
    // metacharacter or the root of the search string, in which case we have
    // a literal substring.
    for (endmark = p; p >= name; p--)
      if (index("]*?", *p) != 0)
        break;

    // From the first non-metacharacter found to the first metacharacter (or
    // EOL), copy that substring into the 100-byte reserve. Stop one slot
    // early so the terminating NUL below always lands inside the buffer.
    for (++p; (p <= endmark) && subp < (globfree + sizeof(globfree) - 1);)
      *subp++ = *p++;
  }
  // Give a null ending.
  *subp = '\0';
  // Return a pointer to the last byte of the pattern reserve.
  return (--subp);
}
|
|
@ -0,0 +1 @@
|
||||||
|
char *patprep(char *name);
|
|
@ -0,0 +1,3 @@
|
||||||
|
// pub use crate::codesquoze;
|
||||||
|
// pub mod squozen;
|
||||||
|
pub mod prepare_pattern;
|
|
@ -0,0 +1,92 @@
|
||||||
|
/// Every byte that has special meaning in a glob pattern.
const GLOBCHARS: &[u8] = b"?*[]";
/// Bytes that end a literal run when a pattern is scanned backwards.
const GLOBSTARTS: &[u8] = b"?*]";

// prepare_pattern
//
// This function finds the first substring of characters, starting from the end
// of the search string, that does not contain glob-special characters. It
// returns a vector of those characters for comparison. The test cases have all
// been derived from tests performed on the original 1983 `patprep` function
// found in locate.c.

// Unlike the original database, we're going to assume that this slice contains
// only the content of the pattern, and no nulls at either end, relying instead
// on Rust's tracking the size of slices internally.

/// Scans `name` backwards, from index `end` down to index 1, and returns the
/// first index whose byte satisfies `comp`.
///
/// Index 0 is never tested. If no byte in `1..=end` matches (including when
/// `end` is 0), `alt` is returned instead; callers inspect `name[0]`
/// themselves where it matters.
///
/// # Panics
///
/// Panics if `end` is not a valid index into `name` and the scan reaches it.
fn hunt<F>(name: &[u8], end: usize, alt: usize, comp: F) -> usize
where
    F: Fn(&u8) -> bool,
{
    (1..=end).rev().find(|&p| comp(&name[p])).unwrap_or(alt)
}

/// Extracts the last glob-free literal run of the pattern `name`.
///
/// Working backwards from the end of the pattern, trailing `*` / `?` bytes and
/// one trailing `[...]` range are skipped; the run of ordinary bytes that
/// precedes them is returned as an owned vector. If no such run exists the
/// result is `b"/"`.
///
/// For example, `b"/foo/bar/*whatever[0-9]"` yields `b"whatever"` and
/// `b"/foo/bar/whatever*[0-9]"` yields `b"/"`.
///
/// # Panics
///
/// Panics if `name` is empty.
pub fn prepare_pattern(name: &[u8]) -> Vec<u8> {
    assert!(
        !name.is_empty(),
        "Library error - This function should never be called with an empty string."
    );

    // From here on, `eol` is the index of the last byte we want to keep, not
    // the index one past it.

    // Skip trailing `*` and `?`.
    let mut eol = hunt(name, name.len() - 1, 0, |&c| c != b'*' && c != b'?');

    // Skip one trailing `[...]` range. `saturating_sub` matters when the `]`
    // is the very first byte of the pattern (e.g. b"]" or b"]*"): a plain
    // `eol - 1` underflowed there and panicked, whereas the C reference treats
    // such a pattern as "only metacharacters".
    if name[eol] == b']' {
        let open = hunt(name, eol.saturating_sub(1), 0, |&c| c == b'[');
        eol = open.saturating_sub(1);
    }

    // We ran all the way back to the first byte: either the pattern is nothing
    // but metacharacters (search everything via "/"), or that single byte is
    // the literal.
    if eol == 0 {
        return if GLOBCHARS.contains(&name[0]) {
            vec![b'/']
        } else {
            vec![name[0]]
        };
    }

    // Walk back to the metacharacter that precedes the literal run. `hunt`
    // never looks at index 0, so check the byte it landed on and step past it
    // when it is itself a metacharacter.
    let found = hunt(name, eol, 0, |&c| GLOBSTARTS.contains(&c));
    let start = if GLOBSTARTS.contains(&name[found]) {
        found + 1
    } else {
        found
    };

    if start > eol {
        // A metacharacter sits directly at `eol`: there is no literal run.
        vec![b'/']
    } else {
        name[start..=eol].to_vec()
    }
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `prepare_pattern(pattern)` produces `expected`.
    /// `#[track_caller]` makes a failure point at the calling line below.
    #[track_caller]
    fn check(pattern: &[u8], expected: &[u8]) {
        assert_eq!(prepare_pattern(pattern), expected);
    }

    /// Expected values were generated from the original C `patprep`
    /// implementation.
    #[test]
    fn test_patterns() {
        check(b"testing", b"testing");
        check(b"t", b"t");
        check(b"test*", b"test");
        check(b"test*", b"test");
        check(b"/foo/bar/whatever[0-9]", b"/foo/bar/whatever");
        check(b"/foo/bar/whatever*[0-9]", b"/");
        check(b"/foo/bar/whatever[0-9]", b"/foo/bar/whatever");
        check(b"/foo/bar/whatever[0-9]*", b"/foo/bar/whatever");
        check(b"/foo/bar/*whatever[0-9]", b"whatever");
        check(b"fooz]", b"f");
    }
}
|
48
src/lib.rs
48
src/lib.rs
|
@ -1,50 +1,14 @@
|
||||||
// This Source Code Form is subject to the terms of the Mozilla Public
|
pub fn add(left: usize, right: usize) -> usize {
|
||||||
// License, v. 2.0. If a copy of the MPL was not distributed with this
|
left + right
|
||||||
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
}
|
||||||
|
|
||||||
//! MLocate
|
|
||||||
//!
|
|
||||||
//! The readme has the full explanation, but the `locate` suite of
|
|
||||||
//! tools present in all Linux distributions is used to locate files
|
|
||||||
//! on your storage device. Rather than search the device directly,
|
|
||||||
//! `locate` scans a catalog file created during downtime.
|
|
||||||
//!
|
|
||||||
//! `MLocate` is the most popular implementation of the locate system,
|
|
||||||
//! but it has three annoying flaws:
|
|
||||||
//!
|
|
||||||
//! 1. The archive file isn't very compressed.
|
|
||||||
//! 2. The archive file is always an average of 12 hours out of date.
|
|
||||||
//! 3. The archive is accessible only through a command line program.
|
|
||||||
//!
|
|
||||||
//! This program intends to read one of two different formats, the
|
|
||||||
//! classic mlocate format, or a new format that exploits a few nifty
|
|
||||||
//! tricks to try and make the database file smaller and access
|
|
||||||
//! faster.
|
|
||||||
|
|
||||||
extern crate structview;
|
|
||||||
|
|
||||||
pub mod database;
|
|
||||||
pub mod mlocate_db;
|
|
||||||
|
|
||||||
use crate::database::LocateDb;
|
|
||||||
use crate::mlocate_db::MlHeader;
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
use std::fs::File;
|
|
||||||
use std::io::{BufRead, BufReader};
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn can_read_header() -> Result<(), String> {
|
fn it_works() {
|
||||||
let db = File::open("/var/lib/mlocate/mlocate.db").expect("Unable to open file");
|
let result = add(2, 2);
|
||||||
let mut reader = BufReader::new(db);
|
assert_eq!(result, 4);
|
||||||
match reader.fill_buf() {
|
|
||||||
Ok(buffer) => {
|
|
||||||
assert!(MlHeader::is(buffer), "Could not read DB");
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
Err(_) => Err("The header could not be read".to_owned()),
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue