use comrak::nodes::{AstNode, NodeValue}; use comrak::{parse_document, Arena, ComrakOptions}; use lazy_static::lazy_static; use regex::bytes::Regex as BytesRegex; use regex::Regex; pub struct Finder(pub Vec); impl Finder { pub fn new() -> Self { Finder(Vec::new()) } fn iter_nodes<'a, F>(&mut self, node: &'a AstNode<'a>, f: &F) where F: Fn(&'a AstNode<'a>) -> Option>, { if let Some(mut v) = f(node) { self.0.append(&mut v); } for c in node.children() { self.iter_nodes(c, f); } } } /// Given a content block, return a list of all the page references found /// within the block. The references may need further massaging. pub(crate) fn find_links(document: &str) -> Vec { let arena = Arena::new(); let mut finder = Finder::new(); let root = parse_document(&arena, document, &ComrakOptions::default()); finder.iter_nodes(root, &|node| { lazy_static! { static ref RE_REFERENCES: BytesRegex = BytesRegex::new(r"(\[\[([^\]]+)\]\]|(\#[\w\-]+))").unwrap(); } match &node.data.borrow().value { &NodeValue::Text(ref text) => Some( RE_REFERENCES .captures_iter(text) .map(|t| String::from_utf8_lossy(&t.get(1).unwrap().as_bytes()).to_string()) .collect(), ), _ => None, } }); finder.0 } fn recase(title: &str) -> String { lazy_static! { static ref RE_PASS1: Regex = Regex::new(r"(?P.)(?P[A-Z][a-z]+)").unwrap(); static ref RE_PASS2: Regex = Regex::new(r"(?P[[:lower:]]|\d)(?P[[:upper:]])").unwrap(); static ref RE_PASS4: Regex = Regex::new(r"(?P[a-z])(?P\d)").unwrap(); static ref RE_PASS3: Regex = Regex::new(r"(_|-| )+").unwrap(); } // This should panic if misused, so... :-) let pass = title.to_string(); let pass = pass.strip_prefix("#").unwrap(); let pass = RE_PASS1.replace_all(&pass, "$s $n"); let pass = RE_PASS4.replace_all(&pass, "$s $n"); let pass = RE_PASS2.replace_all(&pass, "$s $n"); RE_PASS3.replace_all(&pass, " ").to_string() } fn build_page_titles(references: &Vec) -> Vec { references .iter() .map(|s| { let c = s.chars().nth(0); match c { Some('#') => recase(s), Some('[') => s.strip_prefix("[[").unwrap().strip_suffix("]]").unwrap().to_string(), Some(_) => s.clone(), _ => "".to_string(), } }) .collect() } #[cfg(test)] mod tests { use super::*; #[test] fn finds_expected() { let sample = r###" # Header - NotATest 1 - [[Test 2]] - #Test3 - #TestFourAndSo - #Test-six-is-six - # - [[]] But *[[Test Seven]]* isn't. And *#Test_Eight____is_Messed-up* And [[Test Bite Me]] is the worst. Right? [[ ]] "###; let res = build_page_titles(&find_links(sample)); let expected = vec![ "Test 2", "Test 3", "Test Four And So", "Test six is six", "Test Seven", "Test Eight is Messed up", "Test Bite Me", ]; assert!(res.iter().eq(expected.iter())); } }