Moved everything around so it's more project-y
Added the squozen patprep function, added unit tests to the patprep `c` code, and ensured that the rust version works the same way. The only remaining code slowdown is that re-allocating the Vec 50 million times turns out to be slower than re-using the same slice of RAM over and over and over.
This commit is contained in:
parent
bf2b2715d4
commit
2eab17934c
|
@ -1,2 +1,9 @@
|
|||
/target
|
||||
.#*
|
||||
*~
|
||||
*#
|
||||
*.aux
|
||||
**/*.rs.bk
|
||||
target
|
||||
Cargo.lock
|
||||
|
||||
.ccls-cache
|
||||
|
|
22
Cargo.toml
22
Cargo.toml
|
@ -1,17 +1,17 @@
|
|||
[package]
|
||||
name = "mlocate-rs"
|
||||
version = "0.1.0"
|
||||
name = "rlocate"
|
||||
version = "0.0.1"
|
||||
authors = ["Elf M. Sternberg <elf.sternberg@gmail.com>"]
|
||||
edition = "2018"
|
||||
license = "MPL-2.0+"
|
||||
description = "Rust implementation of the Linux mlocate client, with library."
|
||||
repository = "https://github.com/elfsternberg/mlocate-rs"
|
||||
readme = "./README.md"
|
||||
description = "Unix Locate/Updatedb utility"
|
||||
license = "Apache-2.0 WITH LLVM-exception"
|
||||
categories = ["coreutil"]
|
||||
keywords = ["unix", "utility", "cli"]
|
||||
repository = "https://git.elfsternberg.com/elf/rlocate"
|
||||
readme = "README.md"
|
||||
# default-run = "locate"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
globset = "0.4.8"
|
||||
regex = "1.5.4"
|
||||
clap = "2.33.3"
|
||||
structview = "1.1.0"
|
||||
|
||||
|
|
|
@ -25,8 +25,6 @@ Right now, none of this works. The _intended_ feature list is:
|
|||
|
||||
- Can read the following locatedb formats:
|
||||
- MLOCATE
|
||||
- LOCATE01
|
||||
- LOCATE02
|
||||
- Provides new locatedb formats:
|
||||
- RLOCR01: Directory path prefixes are built by reference, making for
|
||||
a much smaller database.
|
||||
|
|
|
@ -0,0 +1,17 @@
|
|||
[package]
|
||||
name = "mlocate-rs"
|
||||
version = "0.1.0"
|
||||
authors = ["Elf M. Sternberg <elf.sternberg@gmail.com>"]
|
||||
edition = "2018"
|
||||
license = "MPL-2.0+"
|
||||
description = "Rust implementation of the Linux mlocate client, with library."
|
||||
repository = "https://github.com/elfsternberg/mlocate-rs"
|
||||
readme = "./README.md"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
globset = "0.4.8"
|
||||
regex = "1.5.4"
|
||||
clap = "2.33.3"
|
||||
structview = "1.1.0"
|
|
@ -3,4 +3,4 @@
|
|||
* Implement a "write to rslocate01" feature.
|
||||
* Implement a "write to rzlocate01" feature, stealing wildly from
|
||||
dictd.
|
||||
|
||||
* Read up on fanotify for Linux. See what's rustable.
|
|
@ -0,0 +1,4 @@
|
|||
---
|
||||
name: rlocate
|
||||
description: A version of the locate toolkit, written in rust
|
||||
keywords: rust, library, suite
|
|
@ -0,0 +1,50 @@
|
|||
// This Source Code Form is subject to the terms of the Mozilla Public
|
||||
// License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
//! MLocate
|
||||
//!
|
||||
//! The readme has the full explanation, but the `locate` suite of
|
||||
//! tools present in all Linux distributions is used to locate files
|
||||
//! on your storage device. Rather than search the device directly,
|
||||
//! `locate` scans a catalog file created during downtime.
|
||||
//!
|
||||
//! `MLocate` is the most popular implementation of the locate system,
|
||||
//! but it has three annoying flaws:
|
||||
//!
|
||||
//! 1. The archive file isn't very compressed.
|
||||
//! 2. The archive file is always an average of 12 hours out of date.
|
||||
//! 3. The archive is accessible only through a command line program.
|
||||
//!
|
||||
//! This program intends to read one of two different formats, the
|
||||
//! classic mlocate format, or a new format that exploits a few nifty
|
||||
//! tricks to try and make the database file smaller and access
|
||||
//! faster.
|
||||
|
||||
extern crate structview;
|
||||
|
||||
pub mod database;
|
||||
pub mod mlocate_db;
|
||||
|
||||
use crate::database::LocateDb;
|
||||
use crate::mlocate_db::MlHeader;
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::{BufRead, BufReader};

    /// Opens the system mlocate database, fills the reader's buffer once and
    /// asserts that `MlHeader::is` accepts the buffered bytes.
    ///
    /// The test panics if the database file cannot be opened, and returns an
    /// `Err` if the buffer cannot be filled.
    #[test]
    fn can_read_header() -> Result<(), String> {
        let db = File::open("/var/lib/mlocate/mlocate.db").expect("Unable to open file");
        let mut reader = BufReader::new(db);

        // Convert the I/O failure into the test's String error and bail early.
        let buffer = reader
            .fill_buf()
            .map_err(|_| "The header could not be read".to_owned())?;

        assert!(MlHeader::is(buffer), "Could not read DB");
        Ok(())
    }
}
|
|
@ -0,0 +1,17 @@
|
|||
[package]
|
||||
name = "squozen"
|
||||
description = "Decompressor and search engine for the Squozen format (1983)"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
[lib]
|
||||
name = "squozen"
|
||||
path = "src/lib.rs"
|
||||
|
||||
[dependencies]
|
||||
fnmatch-sys = "1.0.0"
|
||||
|
||||
[[bin]]
|
||||
name = "bench_patprep"
|
||||
path = "bench/bench_patprep.rs"
|
|
@ -0,0 +1,11 @@
|
|||
use squozen::prepare_pattern::prepare_pattern;
|
||||
|
||||
const COUNT: usize = 5 * 1000 * 1000 * 100;
|
||||
|
||||
fn main() {
|
||||
let mut end = COUNT;
|
||||
while end > 0 {
|
||||
let _g = prepare_pattern(b"/foo/bar/whatever[0-9]*");
|
||||
end = end - 1;
|
||||
}
|
||||
}
|
|
@ -0,0 +1,6 @@
|
|||
|
||||
bench:
|
||||
gcc -o bench_patprep patprep.c bench_patprep.c
|
||||
|
||||
rust_examples:
|
||||
gcc -o rust_examples patprep.c gen_rust_examples.c
|
|
@ -0,0 +1,9 @@
|
|||
#include "patprep.h"
|
||||
|
||||
/* Number of patprep() calls to time.
 *
 * The earlier expression 5 * 1000 * 1000 * 1000 does not fit in a 32-bit int
 * (signed overflow is undefined behaviour), so the loop count was not what
 * the source claimed.  500 million fits in an int and is the same iteration
 * count used by bench/bench_patprep.rs, which keeps the two benchmarks
 * comparable. */
const int count = 5 * 1000 * 1000 * 100;

/* Benchmark driver: run patprep() exactly `count` times on a fixed pattern. */
int main(void) {
  for (int i = 0; i < count; i++) {
    patprep("/foo/bar/whatever[0-9]*");
  }
  return 0;
}
|
|
@ -0,0 +1,36 @@
|
|||
#include "patprep.h"
|
||||
#include <stdio.h>
|
||||
|
||||
// Below is a series of test cases that were not present in the original. The
|
||||
// purpose of this function is to output the test cases that will go into the
|
||||
// rust version of the project, to assert that they behave correctly.
|
||||
|
||||
char *cases[] = {"testing",
|
||||
"test*",
|
||||
"/foo/bar/whatever[0-9]",
|
||||
"/foo/bar/whatever*[0-9]",
|
||||
"/foo/bar/whatever[0-9]",
|
||||
"/foo/bar/whatever[0-9]*",
|
||||
"/foo/bar/*whatever[0-9]",
|
||||
"fooz]",
|
||||
NULL};
|
||||
|
||||
// Since patprep gives us the END of the array, we need to search for
|
||||
// the beginning. And both ends are null terminated, because Unix.
|
||||
// There is a ton of undefined behavior here, all of which is predicated
|
||||
// on never sending patprep something it can't parse, or a zero-length
|
||||
// string.
|
||||
|
||||
void main() {
|
||||
int i;
|
||||
char *c;
|
||||
char *g;
|
||||
for (i = 0, c = cases[0]; cases[i] != NULL; ++i, c = cases[i]) {
|
||||
g = patprep(c);
|
||||
while (*g != 0) {
|
||||
g--;
|
||||
}
|
||||
g++;
|
||||
printf("assert_eq!(prepare_pattern(b\"%s\"), b\"%s\");\n", c, g);
|
||||
}
|
||||
}
|
|
@ -0,0 +1,74 @@
|
|||
#include "patprep.h"
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <strings.h>
|
||||
|
||||
// This is a copy of the patprep() function from the original 1983 version of
|
||||
// locate.c. I have annotated it heavily in order to document, in my own mind at
|
||||
// any rate, exactly what it does, step by step.
|
||||
|
||||
// Scratch buffer for the extracted literal. Layout: one leading NUL, the
// literal bytes, one trailing NUL. With 100 bytes that leaves room for at
// most 98 literal bytes; longer literals are truncated.
static char globfree[100];

// patprep: extract, scanning from the END of the glob pattern `name`, the
// last run of characters that contains no glob metacharacters.
//
// The run is copied into the static buffer `globfree`, NUL-delimited on both
// sides, and a pointer to the LAST byte of the copied run is returned (the
// caller walks backwards from there). If nothing was copied, the returned
// pointer addresses the leading NUL itself. If the pattern is empty or is
// reduced to a single leading metacharacter, the run is the single byte '/'.
//
// The result lives in static storage: it is overwritten by the next call and
// the function is not reentrant.
char *patprep(char *name)
{
    char *subp = globfree;
    // Last slot is reserved for the terminating NUL. The previous bound
    // (globfree + sizeof(globfree)) let the copy loop fill the final slot
    // and then wrote the terminator one byte past the end of the array.
    char *const limit = globfree + sizeof(globfree) - 1;
    // Signed indices instead of pointers: the scans below step to "one
    // before the start" (-1), which is not a valid pointer value to form.
    long p = (long)strlen(name) - 1;
    long endmark;

    *subp++ = '\0';

    /* starting from the END of the string, skip trailing metacharacters */
    while (p >= 0 && strchr("*?", name[p]) != NULL)
        p--;

    if (p < 0)
        p = 0;

    // Skip past a trailing [...] range: back up to just before its '['.
    if (name[p] == ']')
        for (p--; p >= 0; p--)
            if (name[p] == '[') {
                p--;
                break;
            }

    if (p < 0)
        p = 0;

    /*
     * if pattern has only metacharacters, check every path (force '/'
     * search)
     */

    // We got to the start of the string and it is a metacharacter. Give the
    // matcher an anchoring '/' so it has something it can make sense of.
    // (strchr also matches the terminating NUL, so an empty pattern takes
    // this branch as well.)
    if (p == 0 && strchr("?*[]", name[p]) != NULL)
        *subp++ = '/';
    else {
        // From where we are, scan backwards until we find another
        // metacharacter or run off the front of the pattern; what lies
        // between is a literal substring.
        for (endmark = p; p >= 0; p--)
            if (strchr("]*?", name[p]) != NULL)
                break;

        // Copy that substring into the buffer, stopping early if the buffer
        // is full.
        for (++p; p <= endmark && subp < limit;)
            *subp++ = name[p++];
    }

    // Give a null ending.
    *subp = '\0';
    // Return a pointer to the last byte of the extracted literal.
    return (--subp);
}
|
|
@ -0,0 +1 @@
|
|||
char *patprep(char *name);
|
|
@ -0,0 +1,3 @@
|
|||
// pub use crate::codesquoze;
|
||||
// pub mod squozen;
|
||||
pub mod prepare_pattern;
|
|
@ -0,0 +1,92 @@
|
|||
/// Every byte that is treated as a glob metacharacter.
const GLOBCHARS: &[u8] = b"?*[]";
/// Metacharacters that stop the backwards scan for the start of a literal run.
const GLOBSTARTS: &[u8] = b"?*]";

// prepare_pattern
//
// This function finds the first substring of characters, starting from the end
// of the search string, that does not contain glob-special characters. It
// returns a vector of those characters for comparison. The test cases have all
// been derived from tests performed on the original 1983 `patprep` function
// found in locate.c.

// Unlike the original database, we're going to assume that this slice contains
// only the content of the pattern, and no nulls at either end, relying instead
// on Rust's tracking the size of slices internally.

/// Scans `name` backwards from index `end` down to index 1 and returns the
/// first index whose byte satisfies `comp`; returns `alt` if none does.
///
/// Index 0 is deliberately never tested: every caller passes `alt == 0` and
/// inspects `name[0]` itself afterwards.
///
/// Panics if `end` is non-zero and not a valid index into `name`.
fn hunt<F>(name: &[u8], end: usize, alt: usize, comp: F) -> usize
where
    F: Fn(&u8) -> bool,
{
    (1..=end).rev().find(|&p| comp(&name[p])).unwrap_or(alt)
}

/// Returns the last run of literal (non-glob) bytes in the pattern `name`.
///
/// Trailing `*` / `?` bytes and one trailing `[...]` range are skipped first;
/// the run of bytes before them, back to the previous `?`, `*` or `]`, is
/// returned. When no literal run is found there, the single byte `/` is
/// returned instead.
///
/// # Panics
///
/// Panics if `name` is empty.
pub fn prepare_pattern(name: &[u8]) -> Vec<u8> {
    if name.is_empty() {
        panic!("Library error - This function should never be called with an empty string.")
    }

    // From here on `eol` is the index of the last byte we still care about,
    // not one past it.
    let mut eol = hunt(name, name.len() - 1, 0, |&c| c != b'*' && c != b'?');

    if name[eol] == b']' {
        // `eol` can already be 0 here (patterns such as "]" or "]*"), so the
        // step back must saturate instead of underflowing.
        eol = hunt(name, eol.saturating_sub(1), 0, |&c| c == b'[').saturating_sub(1);
    }

    if eol == 0 {
        return if GLOBCHARS.contains(&name[0]) {
            vec![b'/']
        } else {
            vec![name[0]]
        };
    }

    // `hunt` never looks at index 0, so check that byte here and step over a
    // metacharacter if that is where the scan ended up.
    let start = hunt(name, eol, 0, |c| GLOBSTARTS.contains(c));
    let start = if GLOBSTARTS.contains(&name[start]) {
        start + 1
    } else {
        start
    };

    if start > eol {
        vec![b'/']
    } else {
        name[start..=eol].to_vec()
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `prepare_pattern` against a fixed table of patterns and the
    /// literal each one is expected to yield.
    #[test]
    fn test_patterns() {
        // Plain literals come back unchanged.
        assert_eq!(prepare_pattern(b"testing"), b"testing");
        assert_eq!(prepare_pattern(b"t"), b"t");

        // Trailing metacharacters and ranges are stripped.
        assert_eq!(prepare_pattern(b"test*"), b"test");
        assert_eq!(
            prepare_pattern(b"/foo/bar/whatever[0-9]"),
            b"/foo/bar/whatever"
        );
        assert_eq!(
            prepare_pattern(b"/foo/bar/whatever[0-9]*"),
            b"/foo/bar/whatever"
        );

        // A metacharacter directly before the trailing range leaves no literal.
        assert_eq!(prepare_pattern(b"/foo/bar/whatever*[0-9]"), b"/");

        // Only the literal after the last interior metacharacter is kept.
        assert_eq!(prepare_pattern(b"/foo/bar/*whatever[0-9]"), b"whatever");
        assert_eq!(prepare_pattern(b"*foo"), b"foo");

        // Unbalanced ']' and metacharacter-only patterns.
        assert_eq!(prepare_pattern(b"fooz]"), b"f");
        assert_eq!(prepare_pattern(b"*"), b"/");
    }
}
|
48
src/lib.rs
48
src/lib.rs
|
@ -1,50 +1,14 @@
|
|||
// This Source Code Form is subject to the terms of the Mozilla Public
|
||||
// License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
// file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
//! MLocate
|
||||
//!
|
||||
//! The readme has the full explanation, but the `locate` suite of
|
||||
//! tools present in all Linux distributions is used to locate files
|
||||
//! on your storage device. Rather than search the device directly,
|
||||
//! `locate` scans a catalog file created during downtime.
|
||||
//!
|
||||
//! `MLocate` is the most popular implementation of the locate system,
|
||||
//! but it has three annoying flaws:
|
||||
//!
|
||||
//! 1. The archive file isn't very compressed.
|
||||
//! 2. The archive file is always an average of 12 hours out of date.
|
||||
//! 3. The archive is accessible only through a command line program.
|
||||
//!
|
||||
//! This program intends to read one of two different formats, the
|
||||
//! classic mlocate format, or a new format that exploits a few nifty
|
||||
//! tricks to try and make the database file smaller and access
|
||||
//! faster.
|
||||
|
||||
extern crate structview;
|
||||
|
||||
pub mod database;
|
||||
pub mod mlocate_db;
|
||||
|
||||
use crate::database::LocateDb;
|
||||
use crate::mlocate_db::MlHeader;
|
||||
pub fn add(left: usize, right: usize) -> usize {
|
||||
left + right
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use std::fs::File;
|
||||
use std::io::{BufRead, BufReader};
|
||||
|
||||
#[test]
|
||||
fn can_read_header() -> Result<(), String> {
|
||||
let db = File::open("/var/lib/mlocate/mlocate.db").expect("Unable to open file");
|
||||
let mut reader = BufReader::new(db);
|
||||
match reader.fill_buf() {
|
||||
Ok(buffer) => {
|
||||
assert!(MlHeader::is(buffer), "Could not read DB");
|
||||
Ok(())
|
||||
}
|
||||
Err(_) => Err("The header could not be read".to_owned()),
|
||||
}
|
||||
fn it_works() {
|
||||
let result = add(2, 2);
|
||||
assert_eq!(result, 4);
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue