diff --git a/.gitignore b/.gitignore
index 79ec19e..ef94558 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,7 +13,6 @@
 build
 eggs
 parts
-bin
 var
 sdist
 develop-eggs
diff --git a/bin/extract-delicious.hy b/bin/extract-delicious.hy
new file mode 100644
index 0000000..210f521
--- /dev/null
+++ b/bin/extract-delicious.hy
@@ -0,0 +1,79 @@
+#!/usr/local/bin/hy
+
+;; Scrape a del.icio.us bookmark listing and print it to stdout as
+;; org-mode headings, following "Next" links from page to page.
+;;
+;; Usage: extract-delicious.hy URL
+
+(def *version* "0.0.2")
+
+(import os re sys html2text
+        requests
+        [slugify [slugify]]
+        [datetime [datetime]]
+        [bs4 [BeautifulSoup]]
+        [xml.etree.ElementTree :as ET])
+
+;; Captures the date text that follows "saved by <user> on ".
+(def search-date (re.compile "saved by \S+ on (.*)"))
+
+;; Convert the "Month day, Year" date found in INFOBLOCK into an
+;; org-mode timestamp body; the time of day is always 00:00.
+(defn extract-date [infoblock]
+  (let [datetext (.group (.search search-date infoblock) 1)
+        dateparse (.strptime datetime datetext "%B %d, %Y")]
+    (.strftime dateparse "%Y-%m-%d %a 00:00")))
+
+;; Return the text of every <li> element found under TAGS.
+(defn extract-tags [tags]
+  (map (fn [li] (. li text)) (.find-all tags "li")))
+
+;; Print one bookmark ARTICLE as an org-mode heading with a
+;; :PROPERTIES: drawer, followed by its comment text when present.
+(defn get-details [article]
+  (let [anchor (. (.find article "h3") a)
+        title (. anchor text)
+        info (.find article "div" :class "articleInfoPan")
+        url (-> info (. p) (. a) (. text))
+        created-date (extract-date (. info text))
+        desc (.find article "div" :class "thumbTBriefTxt")
+        rawtags (list (extract-tags desc))
+        comment (.join " " (map (fn [i] (. i text)) (.find-all desc "p")))
+        tags (if (> (len rawtags) 0) (+ ":" (.join ":" rawtags) ":") "")]
+    ;; An org link is [[target][description]]; both brackets must close.
+    (print (.format "** [[{}][{}]] {}" url title tags))
+    (print ":PROPERTIES:")
+    (print (.format ":created: [{}]" created-date))
+    ;; Org only recognises the drawer when it is closed by ":END:".
+    (print ":END:")
+    (print "")
+    (if (> (len comment) 0)
+      (do (print comment)
+          (print "")))))
+
+;; Print every bookmark found in HTML and return the absolute URL of
+;; the next page, or None when there is no "Next" link.
+(defn process-page [html]
+  (let [soup (BeautifulSoup html "lxml")
+        articles (.find-all soup "div" :class "articleThumbBlockOuter")]
+    (for [article articles] (get-details article))
+    (let [nexturl (.find soup "a" {"aria-label" "Next"})]
+      (if nexturl
+        (+ "https://del.icio.us" (get nexturl "href"))
+        None))))
+
+;; Fetch URL, process the page and return the next page's URL (or None).
+(defn process-request [url]
+  (let [req (requests.get url)
+        html (. req text)]
+    (process-page html)))
+
+;; Start at the URL given as the first command-line argument and keep
+;; following "Next" links until a page has none.
+;; NOTE(review): this `try` has no handler clauses -- confirm intent.
+(defmain [&rest args]
+  (try
+   (let [nexturl (get args 1)]
+     (while nexturl
+       (setv nexturl (process-request nexturl))))))
+
diff --git a/bin/extract-enex.hy b/bin/extract-enex.hy
new file mode 100644
index 0000000..9d66436
--- /dev/null
+++ b/bin/extract-enex.hy
@@ -0,0 +1,79 @@
+#!/usr/local/bin/hy
+
+;; Convert an Evernote export (.enex) file into one org-mode file per
+;; note, and append a link to each generated file to Bookmarks.org.
+;;
+;; Usage: extract-enex.hy FILE.enex
+
+(def *version* "0.0.2")
+
+(import os re sys html2text
+        [slugify [slugify]]
+        [datetime [datetime]]
+        [xml.etree.ElementTree :as ET])
+
+;; Reformat an export timestamp (YYYYmmddTHHMMSSZ) as the body of an
+;; org-mode timestamp.
+(defn process-date [ndate]
+  (let [d (.strptime datetime ndate "%Y%m%dT%H%M%SZ")]
+    (.strftime d "%Y-%m-%d %a %H:%M")))
+
+;; NOTE(review): as written the greedy "^.*" consumes the first line and
+;; the capture group is always empty -- confirm the intended pattern.
+(def body-wrap (re.compile "^.*(.*?)"))
+(def htmparser (.HTML2Text html2text))
+
+;; Matches a literal "* * *" at the very start of a string.
+(def post-body-clean-re (re.compile "^\* \* \*"))
+
+;; Convert the text of the BODY element to markdown, removing a
+;; leading "* * *" both before and after the conversion.
+(defn process-body [body]
+  (let [post-body (.sub body-wrap "\\1" (. body text))
+        markdown (htmparser.handle (.sub post-body-clean-re "" post-body))]
+    (.sub post-body-clean-re "" markdown)))
+
+;; Build the org-mode text for NOTE and return it as (, title content).
+(defn process-note [note]
+  (let [source (note.find ".//source-url")
+        url (if (is None source) None (. source text))
+        title (. (note.find "title") text)
+        created-date (. (note.find "created") text)
+        ;; A childless Element is falsy, so compare against None
+        ;; explicitly instead of testing the element's truth value.
+        updated (note.find "updated")
+        updated-date (if (is None updated) "" (. updated text))
+        rawtags (list (map (fn [a] (. a text)) (note.findall "tag")))
+        ;; No tags must give "", not "::".
+        tags (if (> (len rawtags) 0) (+ ":" (.join ":" rawtags) ":") "")
+        body (note.find "content")]
+    (, title (.join "\n" (+ ["#+STARTUP: showall "
+                             ""
+                             (if url
+                               (.format "** [[{}][{}]] {}" url title tags)
+                               (.format "** {} {}" title tags))
+                             ":PROPERTIES:"
+                             (.format ":created: [{}]" (process-date created-date))]
+                            (if updated-date
+                              [(.format ":updated: [{}]" (process-date updated-date))] [])
+                            ;; Org needs ":END:" to close the drawer.
+                            [":END:" "" (process-body body) ""])))))
+
+;; Parse the export named by the first command-line argument, write
+;; "<slug>.org" for each note and append an index entry per note to
+;; Bookmarks.org in the current directory.
+;; NOTE(review): this `try` has no handler clauses -- confirm intent.
+(defmain [&rest args]
+  (try
+   (let [filename (get args 1)
+         tree (ET.parse filename)
+         root (.getroot tree)
+         notes (root.iter "note")]
+     (with [bmarks (open "Bookmarks.org" "a")]
+       (.write bmarks "* Bookmarks\n\n")
+       (for [note notes]
+         (let [(, title content) (process-note note)
+               slug (slugify title)]
+           (with [hndl (open (.format "{}.org" slug) "w")]
+             (.write hndl content))
+           (.write bmarks (.format "** [[file:./{}.org][{}]]\n\n" slug title))))))))