use comrak::nodes::{AstNode, NodeValue}; use comrak::{parse_document, Arena, ComrakOptions}; use lazy_static::lazy_static; use regex::bytes::Regex as BytesRegex; use regex::Regex; struct Finder(pub Vec); impl Finder { pub fn new() -> Self { Finder(Vec::new()) } fn iter_nodes<'a, F>(&mut self, node: &'a AstNode<'a>, f: &F) where F: Fn(&'a AstNode<'a>) -> Option>, { if let Some(mut v) = f(node) { self.0.append(&mut v); } for c in node.children() { self.iter_nodes(c, f); } } } pub(super) fn find_links(document: &str) -> Vec { let arena = Arena::new(); let mut finder = Finder::new(); let root = parse_document(&arena, document, &ComrakOptions::default()); finder.iter_nodes(root, &|node| { lazy_static! { static ref RE_REFERENCES: BytesRegex = BytesRegex::new(r"(\[\[([^\]]+)\]\]|(\#[:\w\-]+))").unwrap(); } match &node.data.borrow().value { NodeValue::Text(ref text) => Some( RE_REFERENCES .captures_iter(text) .filter_map(|t| t.get(1)) .map(|t| String::from_utf8_lossy(t.as_bytes()).to_string()) .filter(|s| !s.is_empty()) .collect(), ), _ => None, } }); finder.0 } // This function is for the camel and snake case handers. fn recase(title: &str) -> String { lazy_static! { // Take every word that has a pattern of a capital letter // followed by a lower case, and put a space between the // capital and anything that preceeds it. // TODO: Make Unicode aware. static ref RE_PASS1: Regex = Regex::new(r"(?P.)(?P[A-Z][a-z]+)").unwrap(); // Take every instance of a lower case letter or number, // followed by a capital letter, and put a space between them. // TODO: Make Unicode aware. [[:lower:]] is an ASCII-ism. static ref RE_PASS2: Regex = Regex::new(r"(?P[[:lower:]]|\d)(?P[[:upper:]])").unwrap(); // Take every instance of a word suffixed by a number and put // a space between them. // TODO: Make Unicode aware. [[:lower:]] is an ASCII-ism. static ref RE_PASS4: Regex = Regex::new(r"(?P[[:lower:]])(?P\d)").unwrap(); // Take every instance of the one-or-more-of the symbols listed, and // replace them with a space. This function is Unicode-irrelevant, // although there is a list of symbols in the backreference parser // that may disagree. // TODO: Examime backreference parser and determine if this is // sufficient. static ref RE_PASS3: Regex = Regex::new(r"(:|_|-| )+").unwrap(); } // This should panic if misused, so... :-) let pass = title.to_string(); let pass = pass.strip_prefix("#").unwrap(); let pass = RE_PASS1.replace_all(&pass, "$s $n"); let pass = RE_PASS4.replace_all(&pass, "$s $n"); let pass = RE_PASS2.replace_all(&pass, "$s $n"); RE_PASS3.replace_all(&pass, " ").trim().to_string() } pub(super) fn build_page_titles(references: &[String]) -> Vec { references .iter() .filter_map(|s| match s.chars().next() { Some('#') => Some(recase(s)), Some('[') => Some(s.strip_prefix("[[").unwrap().strip_suffix("]]").unwrap().to_string()), Some(_) => Some(s.clone()), _ => None, }) .filter(|s| !s.is_empty()) .collect() } #[cfg(test)] mod tests { use super::*; #[test] fn finds_expected() { let sample = r###" # Header - NotATest 1 - [[Test 2]] - #Test3 - #TestFourAndSo - #Test-six-is-six - #recipe:entree - # - #-_- - #--Prefixed - [[]] But *[[Test Seven]]* isn't. And *#Test_Eight____is_Messed-up* And [[Test Bite Me]] is the worst. Right? [[ ]] "###; let res = build_page_titles(&find_links(sample)); let expected = vec![ "Test 2", "Test 3", "Test Four And So", "Test six is six", "recipe entree", "Prefixed", "Test Seven", "Test Eight is Messed up", "Test Bite Me", ]; assert!(res.iter().eq(expected.iter()), "{:?}", res); } #[test] fn doesnt_crash_on_empty() { let sample = ""; let res = build_page_titles(&find_links(sample)); let expected: Vec = vec![]; assert!(res.iter().eq(expected.iter()), "{:?}", res); } }