FEAT: Reference parser is now working.

It's probably not the fastest thing in the world, but
it's going to be enough for now.
This commit is contained in:
Elf M. Sternberg 2020-10-13 17:23:24 -07:00
parent 1c0f3abd6c
commit 4e04bb47d5
3 changed files with 121 additions and 0 deletions

View File

@ -15,6 +15,7 @@ friendly_id = "0.3.0"
thiserror = "1.0.20"
derive_builder = "0.9.0"
lazy_static = "1.4.0"
comrak = "0.8.2"
regex = "1.3.9"
slug = "0.1.4"
tokio = { version = "0.2.22", features = ["rt-threaded", "blocking"] }

View File

@ -2,6 +2,7 @@ mod errors;
mod row_structs;
mod store;
mod structs;
mod reference_parser;
pub use crate::errors::NoteStoreError;
pub use crate::store::NoteStore;

View File

@ -0,0 +1,119 @@
use comrak::nodes::{AstNode, NodeValue};
use comrak::{parse_document, Arena, ComrakOptions};
use lazy_static::lazy_static;
use regex::bytes::Regex as BytesRegex;
use regex::Regex;
/// Accumulates the strings harvested while walking a comrak AST.
///
/// The wrapped vector holds every string produced by the visitor passed to
/// `iter_nodes`, in visiting order: a node first, then its children in order.
#[derive(Debug, Default)]
pub struct Finder(pub Vec<String>);

impl Finder {
    /// Creates a finder with an empty result list.
    pub fn new() -> Self {
        Self::default()
    }

    /// Applies `f` to `node` and then, recursively, to each of its children,
    /// appending whatever strings `f` returns to the accumulated list.
    ///
    /// A `None` from `f` contributes nothing for that node; its children are
    /// still visited.
    fn iter_nodes<'a, F>(&mut self, node: &'a AstNode<'a>, f: &F)
    where
        F: Fn(&'a AstNode<'a>) -> Option<Vec<String>>,
    {
        if let Some(found) = f(node) {
            self.0.extend(found);
        }
        for child in node.children() {
            self.iter_nodes(child, f);
        }
    }
}
/// Given a content block, return a list of all the page references found
/// within the block. The references may need further massaging.
///
/// The document is parsed with comrak's default options and only text nodes
/// are searched. Each returned string is the complete match: either a
/// `[[...]]` reference including its brackets, or a `#tag` including its
/// leading `#`.
pub(crate) fn find_links(document: &str) -> Vec<String> {
    lazy_static! {
        static ref RE_REFERENCES: BytesRegex =
            BytesRegex::new(r"(\[\[([^\]]+)\]\]|(\#[\w\-]+))").unwrap();
    }

    let arena = Arena::new();
    let root = parse_document(&arena, document, &ComrakOptions::default());

    let mut finder = Finder::new();
    finder.iter_nodes(root, &|node| {
        match &node.data.borrow().value {
            NodeValue::Text(text) => Some(
                RE_REFERENCES
                    .captures_iter(text)
                    // Group 1 wraps the whole alternation, so it is present
                    // in every match.
                    .map(|caps| String::from_utf8_lossy(&caps[1]).into_owned())
                    .collect(),
            ),
            _ => None,
        }
    });
    finder.0
}
fn recase(title: &str) -> String {
lazy_static! {
static ref RE_PASS1: Regex = Regex::new(r"(?P<s>.)(?P<n>[A-Z][a-z]+)").unwrap();
static ref RE_PASS2: Regex = Regex::new(r"(?P<s>[[:lower:]]|\d)(?P<n>[[:upper:]])").unwrap();
static ref RE_PASS4: Regex = Regex::new(r"(?P<s>[a-z])(?P<n>\d)").unwrap();
static ref RE_PASS3: Regex = Regex::new(r"(_|-| )+").unwrap();
}
// This should panic if misused, so... :-)
let pass = title.to_string();
let pass = pass.strip_prefix("#").unwrap();
let pass = RE_PASS1.replace_all(&pass, "$s $n");
let pass = RE_PASS4.replace_all(&pass, "$s $n");
let pass = RE_PASS2.replace_all(&pass, "$s $n");
RE_PASS3.replace_all(&pass, " ").to_string()
}
/// Turns raw references into page titles, one output per input, in order.
///
/// * A reference starting with `#` is converted by `recase`.
/// * A reference of the form `[[...]]` has its surrounding brackets removed.
/// * Anything else, including an empty string or a `[`-prefixed string that
///   is not a complete `[[...]]`, is returned unchanged rather than panicking.
fn build_page_titles(references: &[String]) -> Vec<String> {
    references
        .iter()
        .map(|reference| {
            if reference.starts_with('#') {
                return recase(reference);
            }
            reference
                .strip_prefix("[[")
                .and_then(|rest| rest.strip_suffix("]]"))
                // Not a well-formed bracket reference: keep the text as-is.
                .unwrap_or(reference)
                .to_string()
        })
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the full pipeline (`find_links` then `build_page_titles`) over a
    /// sample that mixes valid references with near-misses that must be
    /// ignored: plain words, a bare `#`, empty `[[]]`, and brackets split
    /// across lines.
    #[test]
    fn finds_expected() {
        let sample = r###"
# Header
- NotATest 1
- [[Test 2]]
- #Test3
- #TestFourAndSo
- #Test-six-is-six
- #
- [[]]
But *[[Test Seven]]* isn't. And *#Test_Eight____is_Messed-up*
And [[Test Bite Me]] is the worst.
Right? [[
]]
"###;
        let res = build_page_titles(&find_links(sample));
        let expected = vec![
            "Test 2",
            "Test 3",
            "Test Four And So",
            "Test six is six",
            "Test Seven",
            "Test Eight is Messed up",
            "Test Bite Me",
        ];
        // assert_eq! prints both lists on failure, which a bare assert! on an
        // iterator comparison does not.
        assert_eq!(res, expected);
    }
}