notesmachine/server/nm-store/src/parser/references.rs

use comrak::nodes::{AstNode, NodeValue};
use comrak::{parse_document, Arena, ComrakOptions};
use lazy_static::lazy_static;
use regex::bytes::Regex as BytesRegex;
use regex::Regex;

/// Accumulator for strings harvested while walking a comrak AST.
///
/// The wrapped vector holds every string collected so far, in the order the
/// nodes were visited.
struct Finder(pub Vec<String>);

impl Finder {
    /// Creates a finder with nothing collected yet.
    pub fn new() -> Self {
        Self(Vec::new())
    }

    /// Visits `node` and then each of its children recursively (parent
    /// before children), calling `f` on every node visited.
    ///
    /// Whenever `f` returns `Some(strings)`, those strings are appended to
    /// the accumulator; a `None` contributes nothing.
    fn iter_nodes<'a, F>(&mut self, node: &'a AstNode<'a>, f: &F)
    where
        F: Fn(&'a AstNode<'a>) -> Option<Vec<String>>,
    {
        // `None` becomes an empty vector, so there is nothing to append.
        self.0.extend(f(node).unwrap_or_default());

        node.children().for_each(|child| self.iter_nodes(child, f));
    }
}

/// Parses `document` as Markdown and returns every reference found in its
/// text nodes, in the order the nodes are visited.
///
/// Two forms are recognised:
///
/// * `[[...]]` with at least one non-`]` character between the brackets;
/// * `#` immediately followed by one or more word characters, `:` or `-`.
///
/// Each returned string is the complete match, so it still carries its
/// `[[`/`]]` delimiters or its leading `#`.
pub(super) fn find_links(document: &str) -> Vec<String> {
    lazy_static! {
        static ref RE_REFERENCES: BytesRegex = BytesRegex::new(r"(\[\[([^\]]+)\]\]|(\#[:\w\-]+))").unwrap();
    }

    let arena = Arena::new();
    let root = parse_document(&arena, document, &ComrakOptions::default());
    let mut finder = Finder::new();

    finder.iter_nodes(root, &|node| {
        let data = node.data.borrow();

        // Only plain text nodes are searched; every other node kind yields
        // nothing.
        if let NodeValue::Text(ref text) = data.value {
            let links = RE_REFERENCES
                .captures_iter(text)
                .filter_map(|caps| {
                    // Group 1 wraps the whole alternation, i.e. the full
                    // reference including its delimiters.
                    let whole = caps.get(1)?;
                    let link = String::from_utf8_lossy(whole.as_bytes()).into_owned();
                    if link.is_empty() {
                        None
                    } else {
                        Some(link)
                    }
                })
                .collect();
            Some(links)
        } else {
            None
        }
    });

    finder.0
}

// This function is for the camel and snake case handers.
fn recase(title: &str) -> String {
    lazy_static! {
        // Take every word that has a pattern of a capital letter
        // followed by a lower case, and put a space between the
        // capital and anything that preceeds it.

        // TODO: Make Unicode aware.
        static ref RE_PASS1: Regex = Regex::new(r"(?P<s>.)(?P<n>[A-Z][a-z]+)").unwrap();

        // Take every instance of a lower case letter or number,
        // followed by a capital letter, and put a space between them.

        // TODO: Make Unicode aware. [[:lower:]] is an ASCII-ism.
        static ref RE_PASS2: Regex = Regex::new(r"(?P<s>[[:lower:]]|\d)(?P<n>[[:upper:]])").unwrap();

        // Take every instance of a word suffixed by a number and put
        // a space between them.

        // TODO: Make Unicode aware. [[:lower:]] is an ASCII-ism.
        static ref RE_PASS4: Regex = Regex::new(r"(?P<s>[[:lower:]])(?P<n>\d)").unwrap();

        // Take every instance of the one-or-more-of the symbols listed, and
        // replace them with a space.  This function is Unicode-irrelevant,
        // although there is a list of symbols in the backreference parser
        // that may disagree.

        // TODO: Examime backreference parser and determine if this is
        // sufficient.
        static ref RE_PASS3: Regex = Regex::new(r"(:|_|-| )+").unwrap();
    }

    // This should panic if misused, so... :-)
    let pass = title.to_string();
    let pass = pass.strip_prefix("#").unwrap();
    let pass = RE_PASS1.replace_all(&pass, "$s $n");
    let pass = RE_PASS4.replace_all(&pass, "$s $n");
    let pass = RE_PASS2.replace_all(&pass, "$s $n");
    RE_PASS3.replace_all(&pass, " ").trim().to_string()
}

/// Converts raw references (such as those returned by `find_links`) into
/// page titles, preserving their order.
///
/// * A reference starting with `#` is converted by `recase`.
/// * A reference of the form `[[Title]]` has its surrounding brackets
///   removed.
/// * Anything else is passed through unchanged. This includes a string that
///   starts with `[` but is not fully wrapped in `[[`…`]]`.
///
/// Empty references, and references whose resulting title is empty, are
/// dropped.
pub(super) fn build_page_titles(references: &[String]) -> Vec<String> {
    references
        .iter()
        .filter_map(|s| match s.chars().next() {
            Some('#') => Some(recase(s)),
            Some('[') => Some(
                s.strip_prefix("[[")
                    .and_then(|rest| rest.strip_suffix("]]"))
                    // Not a well-formed `[[...]]` reference: keep the text
                    // verbatim, like any other unmarked string, rather than
                    // panicking on caller-supplied input.
                    .unwrap_or(s.as_str())
                    .to_string(),
            ),
            Some(_) => Some(s.clone()),
            None => None,
        })
        .filter(|s| !s.is_empty())
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the full pipeline (`find_links` followed by
    /// `build_page_titles`) over a Markdown sample mixing valid and
    /// degenerate references.
    #[test]
    fn finds_expected() {
        let sample = r###"
# Header
- NotATest 1
- [[Test 2]]
- #Test3
- #TestFourAndSo
- #Test-six-is-six
- #recipe:entree
- #
- #-_-
- #--Prefixed
- [[]]

But *[[Test Seven]]* isn't.  And *#Test_Eight____is_Messed-up*
And [[Test Bite Me]] is the worst.
Right? [[
]]
"###;
        let titles = build_page_titles(&find_links(sample));

        // Entries that must NOT yield a title: the unmarked `NotATest 1`,
        // the bare `#`, the separators-only `#-_-`, the empty `[[]]`, and
        // the `[[` … `]]` pair split across two lines.
        let expected = vec![
            "Test 2",
            "Test 3",
            "Test Four And So",
            "Test six is six",
            "recipe entree",
            "Prefixed",
            "Test Seven",
            "Test Eight is Messed up",
            "Test Bite Me",
        ];
        assert_eq!(titles, expected, "{:?}", titles);
    }

    /// An empty document must produce no titles and must not panic.
    #[test]
    fn doesnt_crash_on_empty() {
        let titles = build_page_titles(&find_links(""));
        assert!(titles.is_empty(), "{:?}", titles);
    }
}