extractbookmarks/bin/extract-delicious.hy

#!/usr/local/bin/hy

;; Version string for this script; nothing in the visible source reads it.
(def *version* "0.0.2")

(import os re sys html2text
        requests
        [slugify [slugify]]
        [datetime [datetime]]
        [bs4 [BeautifulSoup]]
        [xml.etree.ElementTree :as ET])

;; Compiled pattern for the "saved by USER on DATE" text of a bookmark's
;; info block; group 1 captures everything after "on " up to the end of
;; that line.
;; The backslash is doubled so the literal really contains "\S" instead of
;; relying on an unrecognised escape sequence being passed through as-is.
(def search-date (re.compile "saved by \\S+ on (.*)"))

(defn extract-date [infoblock]
  "Find the date captured by `search-date` in INFOBLOCK, parse it with the
  format \"%B %d, %Y\", and return it rendered as \"%Y-%m-%d %a 00:00\"."
  (let [found (.search search-date infoblock)
        parsed (datetime.strptime (.group found 1) "%B %d, %Y")]
    (.strftime parsed "%Y-%m-%d %a 00:00")))

(defn extract-tags [tags]
  "Map every \"li\" element found under TAGS to its `text` attribute."
  (let [items (.find-all tags "li")]
    (map (fn [item] (. item text)) items)))

(defn get-details [article]
  "Print one bookmark ARTICLE (a parsed HTML element) as an Org-mode entry.

  Emits a level-2 headline holding a link and the tags, a property drawer
  with the creation date, and, when present, the bookmark's comment text.
  Returns nothing useful; all output goes to stdout."
  (let [anchor (. (.find article "h3") a)
        title (. anchor text)
        info (.find article "div" :class "articleInfoPan")
        ;; The URL is read from the text of the link in the info panel's <p>.
        url (-> info (. p) (. a) (. text))
        created-date (extract-date (. info text))
        desc (.find article "div" :class "thumbTBriefTxt")
        rawtags (list (extract-tags desc))
        comment (.join " " (map (fn [i] (. i text)) (.find-all desc "p")))
        ;; Org tag syntax: ":tag1:tag2:"; empty string when there are no tags.
        tags (if (> (len rawtags) 0) (+ ":" (.join ":" rawtags) ":") "")]
    ;; An Org link is [[target][description]] -- both bracket pairs must be
    ;; closed, otherwise the headline is not recognised as a link.
    (print (.format "** [[{}][{}]] {}" url title tags))
    (print ":PROPERTIES:")
    (print (.format ":created: [{}]" created-date))
    ;; A property drawer is terminated by ":END:" including the final colon.
    (print ":END:")
    (print "")
    (when (> (len comment) 0)
      (print comment)
      (print ""))))

(defn process-page [html]
  "Parse HTML with the lxml parser, print every bookmark found in it via
  `get-details`, and return the absolute URL of the following page, or None
  when the page has no link labelled \"Next\"."
  (let [soup (BeautifulSoup html "lxml")]
    ;; Each bookmark lives in its own "articleThumbBlockOuter" div.
    (for [entry (.find-all soup "div" :class "articleThumbBlockOuter")]
      (get-details entry))
    (let [next-link (.find soup "a" {"aria-label" "Next"})]
      ;; `when` evaluates to None if there is no further page.
      (when next-link
        (+ "https://del.icio.us" (get next-link "href"))))))

(defn process-request [url]
  "Fetch URL with `requests.get` and pass the response text to
  `process-page`, returning whatever `process-page` returns."
  (-> (requests.get url)
      (. text)
      (process-page)))

;; Entry point: the first command-line argument is the URL of the first
;; bookmark page. Each page is processed in turn, following the URL that
;; `process-request` returns, until it returns None.
(defmain [&rest args]
  (if (< (len args) 2)
    ;; args[0] is the script name; without a URL there is nothing to fetch.
    ;; Report the problem and exit with status 1 instead of an IndexError.
    (do (.write sys.stderr "usage: extract-delicious.hy START-URL\n")
        1)
    (do (setv nexturl (get args 1))
        (while nexturl
          (setv nexturl (process-request nexturl))))))
Kinda helps if I include the executables, huh? 2017-06-03 19:50:43 +00:00			`#!/usr/local/bin/hy`

			`(def version "0.0.2")`

			`(import os re sys html2text`
			`requests`
			`[slugify [slugify]]`
			`[datetime [datetime]]`
			`[bs4 [BeautifulSoup]]`
			`[xml.etree.ElementTree :as ET])`

			`(def search-date (re.compile "saved by \S+ on (.*)"))`

			`(defn extract-date [infoblock]`
			`(let [datetext (.group (.search search-date infoblock) 1)`
			`dateparse (.strptime datetime datetext "%B %d, %Y")]`
			`(.strftime dateparse "%Y-%m-%d %a 00:00")))`

			`(defn extract-tags [tags]`
			`(map (fn [li] (. li text)) (.find-all tags "li")))`

			`(defn get-details [article]`
			`(let [anchor (. (.find article "h3") a)`
			`title (. anchor text)`
			`info (.find article "div" :class "articleInfoPan")`
			`url (-> info (. p) (. a) (. text))`
			`created-date (extract-date (. info text))`
			`desc (.find article "div" :class "thumbTBriefTxt")`
			`rawtags (list (extract-tags desc))`
			`comment (.join " " (map (fn [i] (. i text)) (.find-all desc "p")))`
			`tags (if (> (len rawtags) 0) (+ ":" (.join ":" rawtags) ":") "")]`
			`(print (.format "** [[{}][{}] {}" url title tags))`
			`(print ":PROPERTIES:")`
			`(print (.format ":created: [{}]" created-date))`
			`(print ":END")`
			`(print "")`
			`(if (> (len comment) 0)`
			`(do (print comment)`
			`(print "")))))`

			`(defn process-page [html]`
			`(let [soup (BeautifulSoup html "lxml")`
			`articles (.find-all soup "div" :class "articleThumbBlockOuter")]`
			`(for [article articles] (get-details article))`
			`(let [nexturl (.find soup "a" {"aria-label" "Next"})]`
			`(if nexturl`
			`(+ "https://del.icio.us" (get nexturl "href"))`
			`None))))`

			`(defn process-request [url]`
			`(let [req (requests.get url)`
			`html (. req text)]`
			`(process-page html)))`

			`(defmain [&rest args]`
			`(try`
			`(let [nexturl (get args 1)]`
			`(while nexturl`
			`(def nexturl (process-request nexturl))))))`