notesmachine/server/nm-store/src/parser/references.rs

154 lines
4.5 KiB
Rust

use comrak::nodes::{AstNode, NodeValue};
use comrak::{parse_document, Arena, ComrakOptions};
use lazy_static::lazy_static;
use regex::bytes::Regex as BytesRegex;
use regex::Regex;
/// Accumulates the strings harvested while walking a comrak AST.
///
/// The collected strings live in the tuple field, in the order the nodes
/// were visited.
struct Finder(pub Vec<String>);

impl Finder {
    /// Creates a finder that has not collected anything yet.
    pub fn new() -> Self {
        Self(Vec::new())
    }

    /// Walks the tree rooted at `node`, parent first and then each child in
    /// order, calling `f` on every node visited.
    ///
    /// Whenever `f` returns `Some(strings)`, those strings are appended to
    /// the finder; a `None` contributes nothing. The traversal is recursive,
    /// so its depth follows the nesting depth of the tree.
    fn iter_nodes<'a, F>(&mut self, node: &'a AstNode<'a>, f: &F)
    where
        F: Fn(&'a AstNode<'a>) -> Option<Vec<String>>,
    {
        // `None` becomes an empty vector, which extends by nothing.
        self.0.extend(f(node).unwrap_or_default());
        node.children()
            .for_each(|child| self.iter_nodes(child, f));
    }
}
/// Scans a markdown document for page references.
///
/// The document is parsed with comrak's default options and every text node
/// is searched for two kinds of reference:
///
/// * wiki-style links such as `[[Some Page]]`, and
/// * hashtags such as `#SomeTag`, made of word characters, `:` and `-`.
///
/// Each match is returned verbatim, delimiters included (`[[`/`]]` or the
/// leading `#`), in the order the nodes are visited.
pub(super) fn find_links(document: &str) -> Vec<String> {
    lazy_static! {
        static ref RE_REFERENCES: BytesRegex =
            BytesRegex::new(r"(\[\[([^\]]+)\]\]|(\#[:\w\-]+))").unwrap();
    }

    let arena = Arena::new();
    let root = parse_document(&arena, document, &ComrakOptions::default());

    let mut finder = Finder::new();
    finder.iter_nodes(root, &|node| {
        let ast = node.data.borrow();
        // Only text nodes can carry references; everything else is skipped.
        let text = match ast.value {
            NodeValue::Text(ref text) => text,
            _ => return None,
        };

        let mut links = Vec::new();
        for caps in RE_REFERENCES.captures_iter(text) {
            // Group 1 wraps the whole alternation, i.e. the full reference.
            if let Some(whole) = caps.get(1) {
                let link = String::from_utf8_lossy(whole.as_bytes()).to_string();
                if !link.is_empty() {
                    links.push(link);
                }
            }
        }
        Some(links)
    });
    finder.0
}
// This function is for the camel and snake case handers.
fn recase(title: &str) -> String {
lazy_static! {
// Take every word that has a pattern of a capital letter
// followed by a lower case, and put a space between the
// capital and anything that preceeds it.
// TODO: Make Unicode aware.
static ref RE_PASS1: Regex = Regex::new(r"(?P<s>.)(?P<n>[A-Z][a-z]+)").unwrap();
// Take every instance of a lower case letter or number,
// followed by a capital letter, and put a space between them.
// TODO: Make Unicode aware. [[:lower:]] is an ASCII-ism.
static ref RE_PASS2: Regex = Regex::new(r"(?P<s>[[:lower:]]|\d)(?P<n>[[:upper:]])").unwrap();
// Take every instance of a word suffixed by a number and put
// a space between them.
// TODO: Make Unicode aware. [[:lower:]] is an ASCII-ism.
static ref RE_PASS4: Regex = Regex::new(r"(?P<s>[[:lower:]])(?P<n>\d)").unwrap();
// Take every instance of the one-or-more-of the symbols listed, and
// replace them with a space. This function is Unicode-irrelevant,
// although there is a list of symbols in the backreference parser
// that may disagree.
// TODO: Examime backreference parser and determine if this is
// sufficient.
static ref RE_PASS3: Regex = Regex::new(r"(:|_|-| )+").unwrap();
}
// This should panic if misused, so... :-)
let pass = title.to_string();
let pass = pass.strip_prefix("#").unwrap();
let pass = RE_PASS1.replace_all(&pass, "$s $n");
let pass = RE_PASS4.replace_all(&pass, "$s $n");
let pass = RE_PASS2.replace_all(&pass, "$s $n");
RE_PASS3.replace_all(&pass, " ").trim().to_string()
}
pub(super) fn build_page_titles(references: &[String]) -> Vec<String> {
references
.iter()
.filter_map(|s| match s.chars().next() {
Some('#') => Some(recase(s)),
Some('[') => Some(s.strip_prefix("[[").unwrap().strip_suffix("]]").unwrap().to_string()),
Some(_) => Some(s.clone()),
_ => None,
})
.filter(|s| !s.is_empty())
.collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the full pipeline on `markdown`: find the references, then turn
    /// them into page titles.
    fn titles_for(markdown: &str) -> Vec<String> {
        let references = find_links(markdown);
        build_page_titles(&references)
    }

    /// Exercises wiki links and hashtags in list items and inline emphasis,
    /// alongside entries that are expected to yield no title at all (plain
    /// text, a bare `#`, a tag made only of separators, empty brackets, and
    /// brackets split across two lines).
    #[test]
    fn finds_expected() {
        let sample = r###"
# Header
- NotATest 1
- [[Test 2]]
- #Test3
- #TestFourAndSo
- #Test-six-is-six
- #recipe:entree
- #
- #-_-
- #--Prefixed
- [[]]
But *[[Test Seven]]* isn't. And *#Test_Eight____is_Messed-up*
And [[Test Bite Me]] is the worst.
Right? [[
]]
"###;
        let expected = vec![
            "Test 2",
            "Test 3",
            "Test Four And So",
            "Test six is six",
            "recipe entree",
            "Prefixed",
            "Test Seven",
            "Test Eight is Messed up",
            "Test Bite Me",
        ];

        let res = titles_for(sample);
        assert!(res == expected, "{:?}", res);
    }

    /// An empty document must produce no titles and must not panic.
    #[test]
    fn doesnt_crash_on_empty() {
        let res = titles_for("");
        assert!(res.is_empty(), "{:?}", res);
    }
}