extractbookmarks/bin/extract-delicious.hy

#!/usr/local/bin/hy

(def *version* "0.0.2")

(import os re sys html2text
        requests
        [slugify [slugify]]
        [datetime [datetime]]
        [bs4 [BeautifulSoup]]
        [xml.etree.ElementTree :as ET])

(def search-date (re.compile "saved by \S+ on (.*)"))

(defn extract-date [infoblock]
  (let [datetext (.group (.search search-date infoblock) 1)
        dateparse (.strptime datetime datetext "%B %d, %Y")]
    (.strftime dateparse "%Y-%m-%d %a 00:00")))

(defn extract-tags [tags]
  (map (fn [li] (. li text)) (.find-all tags "li")))

(defn get-details [article]
  (let [anchor (. (.find article "h3") a)
        title (. anchor text)
        info (.find article "div" :class "articleInfoPan")
        url (-> info (. p) (. a) (. text))
        created-date (extract-date (. info text))
        desc (.find article "div" :class "thumbTBriefTxt")
        rawtags (list (extract-tags desc))
        comment (.join " " (map (fn [i] (. i text)) (.find-all desc "p")))
        tags (if (> (len rawtags) 0) (+ ":" (.join ":" rawtags) ":") "")]
    (print (.format "** [[{}][{}] {}" url title tags))
    (print ":PROPERTIES:")
    (print (.format ":created: [{}]" created-date))
    (print ":END")
    (print "")
    (if (> (len comment) 0)
      (do (print comment)
          (print "")))))

(defn process-page [html]
  (let [soup (BeautifulSoup html "lxml")
        articles (.find-all soup "div" :class "articleThumbBlockOuter")]
    (for [article articles] (get-details article))
    (let [nexturl (.find soup "a" {"aria-label" "Next"})]
      (if nexturl
        (+ "https://del.icio.us" (get nexturl "href"))
        None))))

(defn process-request [url]
  (let [req (requests.get url)
        html (. req text)]
    (process-page html)))

(defmain [&rest args]
  (try
   (let [nexturl (get args 1)]
     (while nexturl
       (def nexturl (process-request nexturl))))))