extractbookmarks/bin/extract-delicious.hy

61 lines
1.8 KiB
Hy
Raw Permalink Normal View History

#!/usr/local/bin/hy
;; Version string for this script; nothing in the visible source reads it.
(def *version* "0.0.2")
(import os re sys html2text
requests
[slugify [slugify]]
[datetime [datetime]]
[bs4 [BeautifulSoup]]
[xml.etree.ElementTree :as ET])
;; Compiled pattern used by extract-date: finds the text
;; `saved by <non-blank token> on <rest of line>` and captures everything
;; after `on ` (up to the end of that line) as group 1.
;; The backslash is doubled so the regex class \S reaches `re` explicitly
;; instead of depending on an unrecognised string escape being passed
;; through unchanged by the reader.
(def search-date (re.compile "saved by \\S+ on (.*)"))
(defn extract-date [infoblock]
  "Return the save date found in INFOBLOCK, reformatted.

  INFOBLOCK is searched with the module-level search-date pattern; the
  captured text is parsed with the format %B %d, %Y and rendered as
  %Y-%m-%d %a 00:00 (the time part is a fixed literal).

  Raises AttributeError when the pattern does not match (the search
  result is None) and ValueError when the captured text does not parse."
  (let [found (.search search-date infoblock)
        parsed (datetime.strptime (.group found 1) "%B %d, %Y")]
    (-> parsed (.strftime "%Y-%m-%d %a 00:00"))))
(defn extract-tags [tags]
  "Return an iterable with the text of every li element found under TAGS.

  TAGS must provide a find_all method (it is called with the string li);
  each result must expose a text attribute. The result comes straight
  from map, so callers that need a list must wrap it in list."
  (let [items (.find-all tags "li")]
    (map (fn [item] item.text) items)))
(defn get-details [article]
  "Print one bookmark entry as an Org-mode second-level heading.

  ARTICLE is a parsed element for a single bookmark. The following parts
  are read from it:
    - title:   text of the anchor inside the first h3
    - url:     text of the anchor inside the first p of the
               div with class articleInfoPan
    - created: date extracted from that same div's text by extract-date
    - tags:    text of every li inside the div with class thumbTBriefTxt
    - comment: text of every p inside that div, joined with spaces

  Output goes to stdout: the heading line (link plus :tag1:tag2: suffix
  when there are tags), a PROPERTIES drawer holding the created date,
  a blank line and, when non-empty, the comment followed by a blank line.

  Returns None. Raises AttributeError if any of the expected elements
  is missing (the lookups return None in that case)."
  (let [anchor (. (.find article "h3") a)
        title (. anchor text)
        info (.find article "div" :class "articleInfoPan")
        url (-> info (. p) (. a) (. text))
        created-date (extract-date (. info text))
        desc (.find article "div" :class "thumbTBriefTxt")
        rawtags (list (extract-tags desc))
        comment (.join " " (map (fn [i] (. i text)) (.find-all desc "p")))
        ;; Org tag suffix looks like :a:b:c: and is omitted when untagged.
        tags (if (> (len rawtags) 0) (+ ":" (.join ":" rawtags) ":") "")]
    ;; Org link syntax is [[target][description]]; both closing brackets
    ;; are required or the heading is not rendered as a link.
    (print (.format "** [[{}][{}]] {}" url title tags))
    (print ":PROPERTIES:")
    (print (.format ":created: [{}]" created-date))
    ;; A property drawer is terminated by :END: (with the trailing colon).
    (print ":END:")
    (print "")
    (if (> (len comment) 0)
      (do (print comment)
          (print "")))))
(defn process-page [html]
  "Print every bookmark found in HTML and return the next page URL.

  HTML is parsed with BeautifulSoup using the lxml parser. Each div with
  class articleThumbBlockOuter is handed to get-details, which prints it.
  Afterwards the anchor whose aria-label is Next is looked up: when it
  exists, its href is appended to https://del.icio.us and returned;
  otherwise None is returned."
  (let [soup (BeautifulSoup html "lxml")]
    (for [entry (.find-all soup "div" :class "articleThumbBlockOuter")]
      (get-details entry))
    ;; The pagination link is looked up only after all entries are printed.
    (let [next-link (.find soup "a" {"aria-label" "Next"})]
      (if (not next-link)
        None
        (+ "https://del.icio.us" (get next-link "href"))))))
(defn process-request [url]
  "Download URL, print its bookmarks and return the next page URL.

  The page is fetched with requests.get and its decoded body is passed to
  process-page, whose result (next page URL or None) is returned.

  Raises a requests exception when the server does not answer within the
  timeout or responds with an HTTP error status."
  ;; Without a timeout requests may wait forever on a stalled connection.
  (let [req (requests.get url :timeout 30)]
    ;; Fail loudly on 4xx/5xx instead of parsing an error page, which
    ;; would look like a page with no bookmarks and no next link and
    ;; silently end the export early.
    (.raise-for-status req)
    (process-page (. req text))))
;; Entry point: ARGS is the command line, with the start URL expected at
;; index 1. Pages are processed one after another, each call returning the
;; URL of the following page, until no next page is reported.
(defmain [&rest args]
  (if (< (len args) 2)
    ;; No start URL given: explain the usage instead of failing on the
    ;; index lookup, and return 1 as the result of main.
    (do (print "usage: extract-delicious.hy URL" :file sys.stderr)
        1)
    ;; NOTE(review): this try has no except clause; whether it re-raises
    ;; or suppresses errors depends on the Hy version in use -- confirm.
    (try
      (let [nexturl (get args 1)]
        (while nexturl
          (def nexturl (process-request nexturl)))))))