yokogiri

1.5.8


Barebones Nokogiri for Clojure

dependencies

org.clojure/clojure
1.5.1
net.sourceforge.htmlunit/htmlunit
2.13
se.fishtank/css-selectors
1.0.2



(this space intentionally left almost blank)
 
(ns yokogiri.core
  (:require [clojure.java.io :as io])
  (:import [com.gargoylesoftware.htmlunit StringWebResponse WebClient BrowserVersion WebClientOptions]
           [com.gargoylesoftware.htmlunit.html HtmlPage DomNode DomAttr HTMLParser]
           [org.w3c.dom NamedNodeMap Node]
           [se.fishtank.css.selectors.dom DOMNodeSelector]))
(set! *warn-on-reflection* true)

Returns the client options object for a WebClient.

(defn- web-client-options
  "Returns the options object of `client`, as obtained from its
  `getOptions` method."
  [^WebClient client]
  (. client getOptions))
;; Maps each supported option keyword to a two-argument function
;; (options-object, value) that calls the matching WebClientOptions setter.
;;
;; NOTE: `:tracking` calls setDoNotTrackEnabled, so passing true switches the
;; Do-Not-Track setting ON.
(def set-client-options-map
  {:activex-native                  #(.setActiveXNative                     ^WebClientOptions %1 %2)
   :applet                          #(.setAppletEnabled                     ^WebClientOptions %1 %2)
   :block-popups                    #(.setPopupBlockerEnabled               ^WebClientOptions %1 %2)
   :css                             #(.setCssEnabled                        ^WebClientOptions %1 %2)
   :geolocation                     #(.setGeolocationEnabled                ^WebClientOptions %1 %2)
   :homepage                        #(.setHomePage                          ^WebClientOptions %1 %2)
   :insecure-ssl                    #(.setUseInsecureSSL                    ^WebClientOptions %1 %2)
   :print-content-on-failing-status #(.setPrintContentOnFailingStatusCode   ^WebClientOptions %1 %2)
   ;; Alias for :print-content-on-failing-status. get-client-options reports
   ;; this setting under the longer key, so accepting it here lets a map
   ;; returned by get-client-options be passed straight back to
   ;; set-client-options!.
   :print-content-on-failing-status-code
                                    #(.setPrintContentOnFailingStatusCode   ^WebClientOptions %1 %2)
   :redirects                       #(.setRedirectEnabled                   ^WebClientOptions %1 %2)
   :throw-on-failing-status         #(.setThrowExceptionOnFailingStatusCode ^WebClientOptions %1 %2)
   :throw-on-script-error           #(.setThrowExceptionOnScriptError       ^WebClientOptions %1 %2)
   :timeout                         #(.setTimeout                           ^WebClientOptions %1 %2)
   :tracking                        #(.setDoNotTrackEnabled                 ^WebClientOptions %1 %2)
   :javascript                      #(.setJavaScriptEnabled                 ^WebClientOptions %1 %2)})
;; Forward declaration: set-client-options! refers to *client* in its
;; one-argument arity, but the var only receives its value further down the
;; file, after make-client has been defined.
(declare ^:dynamic *client*)

Sets options on the client.

Usage:

(let [client (make-client)]
  (set-client-options! client {:redirects false}))
;=> #<WebClient com.gargoylesoftware.htmlunit.WebClient@7622ccf2>

Available Options:

:activex-native                   bool
:applet                           bool
:block-popups                     bool
:css                              bool
:geolocation                      bool
:insecure-ssl                     bool
:print-content-on-failing-status  bool
:redirects                        bool
:throw-on-failing-status          bool
:throw-on-script-error            bool
:tracking                         bool
:javascript                       bool
:homepage                         string
:timeout                          integer
(defn set-client-options!
  "Applies every entry of the map `opts` to `client` (default: `*client*`)
  by calling the setter registered for the key in `set-client-options-map`.

  Returns `client`.

  Throws IllegalArgumentException when `opts` contains a key that has no
  registered setter. All keys are checked before any setter runs, so a
  misspelled option never leaves the client half-configured."
  ([opts] (set-client-options! *client* opts))
  ([^WebClient client opts]
     (let [^WebClientOptions client-opts (web-client-options client)
           unknown (remove #(contains? set-client-options-map %) (keys opts))]
       ;; Without this check an unknown key would look up a nil setter and
       ;; fail later with an uninformative NullPointerException.
       (when (seq unknown)
         (throw (IllegalArgumentException.
                 (str "Unknown client option(s): " (pr-str (vec unknown))
                      ". Valid options: "
                      (pr-str (vec (sort (keys set-client-options-map))))))))
       (doseq [[k v] opts]
         ((get set-client-options-map k) client-opts v))
       client)))

Returns a map of all options currently set on a client.

Usage:

user> (let [client (make-client :redirects false)]
        (get-client-options client))
;=> {:javascript true, :redirects false, ...}
(defn get-client-options
  "Returns a map describing the options currently set on `client`, read
  from the client's options object.

  The print-content-on-failing-status setting is reported under two keys:
  `:print-content-on-failing-status`, which is the key accepted when
  setting options, and `:print-content-on-failing-status-code`, which is
  kept so existing callers that read the longer key keep working."
  [^WebClient client]
  (let [^WebClientOptions opts (web-client-options client)
        print-content? (. opts getPrintContentOnFailingStatusCode)]
    {:activex-native                       (. opts isActiveXNative)
     :applet                               (. opts isAppletEnabled)
     :block-popups                         (. opts isPopupBlockerEnabled)
     :css                                  (. opts isCssEnabled)
     :geolocation                          (. opts isGeolocationEnabled)
     :homepage                             (. opts getHomePage)
     :insecure-ssl                         (. opts isUseInsecureSSL)
     :javascript                           (. opts isJavaScriptEnabled)
     :print-content-on-failing-status      print-content?
     :print-content-on-failing-status-code print-content?
     :redirects                            (. opts isRedirectEnabled)
     :throw-on-failing-status              (. opts isThrowExceptionOnFailingStatusCode)
     :throw-on-script-error                (. opts isThrowExceptionOnScriptError)
     :timeout                              (. opts getTimeout)
     :tracking                             (. opts isDoNotTrackEnabled)}))

Constructs a new WebClient.

Usage:

user> (make-client)
;=> #<WebClient com.gargoylesoftware.htmlunit.WebClient@124d43a8>

With Options:

user> (make-client :geolocation true
                   :block-popups false)
;=> #<WebClient com.gargoylesoftware.htmlunit.WebClient@4473f04f>

Available Options:

:activex-native                   bool
:applet                           bool
:block-popups                     bool
:css                              bool
:geolocation                      bool
:insecure-ssl                     bool
:print-content-on-failing-status  bool
:redirects                        bool
:throw-on-failing-status          bool
:throw-on-script-error            bool
:tracking                         bool
:javascript                       bool
:homepage                         string
:timeout                          integer

See also: yokogiri.core/set-client-options!

(defn make-client
  "Constructs a new WebClient.

  Optional keyword arguments (e.g. `:redirects false`) are collected into a
  map and applied to the new client with `set-client-options!`.

  Returns the client."
  [& {:as opts}]
  (let [client (WebClient.)]
    ;; Configure the client created above. Previously a second WebClient was
    ;; constructed here and returned, and the first one was thrown away.
    (when (seq opts)
      (set-client-options! client opts))
    client))
;; Default client used by the arities of set-client-options!, get-page and
;; as-page that take no explicit client. defonce keeps an already-created
;; client when the namespace is reloaded; the var is dynamic so with-client
;; can rebind it.
(defonce ^:dynamic *client* (make-client))

Takes a client which will be bound to `*client*` within the scope of the form.

Usage:

user> (with-client (make-client :javascript false)
        (get-page "http://www.example.com/"))
;=> #<HtmlPage HtmlPage(http://www.example.com/)@1536532984>
(defmacro with-client
  "Evaluates `body` with `*client*` dynamically bound to the value of `c`.
  Expands to a `binding` form wrapped around the body."
  [c & body]
  ;; Built by hand instead of with a syntax-quote template; the resulting
  ;; form is (clojure.core/binding [yokogiri.core/*client* c] body...).
  (list* `binding [`*client* c] body))

Takes a string, returns an HtmlPage.

Usage:

user> (create-page "<html><body><a href=\"http://example.com\">Link</a></body></html>")
;=> #<HtmlPage HtmlPage(file://fake-response-url)@478170219>
(defn create-page
  "Parses the markup string `xml` and returns the resulting page.

  The string is wrapped in a StringWebResponse whose originating URL is the
  placeholder file://fake-response-url, and is parsed by
  HTMLParser/parseHtml in the current window of a newly constructed
  WebClient.

  **Usage:**
    user> (create-page \"<html><body><a href=\\\"http://example.com\\\">Link</a></body></html>\")
    ;=> #<HtmlPage HtmlPage(file://fake-response-url)@478170219>"
  [xml]
  (-> (StringWebResponse. xml (io/as-url "file://fake-response-url"))
      (HTMLParser/parseHtml (.getCurrentWindow (WebClient.)))))

Takes a client and a url, returns an HtmlPage.

Usage:

user> (get-page (make-client) "http://www.example.com/")
;=> #<HtmlPage HtmlPage(http://www.example.com/)@478170219>
(defn get-page
  "Fetches `url` (a string) with `client` and returns whatever the
  client's `getPage` method yields. When called with only a url, the
  current `*client*` is used."
  ([url]
     (get-page *client* url))
  ([^WebClient client, ^String url]
     (. client getPage url)))

Takes a path as a string and creates a Page you can access with #'yokogiri.core/xpath, #'yokogiri.core/css, etc.

Usage:

user> (as-page "docs/uberdoc.html")
;=> #<HtmlPage HtmlPage(file:/home/user/yokogiri/docs/uberdoc.html)@171016649>
(defn as-page
  "Loads the file at `path` as a page: the path is turned into a file, the
  file into a URL, and the URL's string form is handed to `get-page`.
  When called with only a path, the current `*client*` is used."
  ([path]
     (as-page *client* path))
  ([client path]
     (let [file-url (io/as-url (io/file path))]
       (get-page client (str file-url)))))

Takes an HtmlPage and an xpath string. Returns a vector of nodes which match the provided xpath string.

Usage:

user> (let [page (get-page your-client "http://www.example.com")]
        (xpath page "//a"))
;=> [#<HtmlAnchor HtmlAnchor[<a href="http://www.iana.org/domains/example">]>]
(defn xpath
  "Evaluates the XPath expression `xpath` against `page` and returns the
  results of the page's `getByXPath` call as a vector."
  [^HtmlPage page, ^String xpath]
  (vec (. page getByXPath xpath)))

Takes an HtmlPage and an xpath string. Returns the first node which matches the provided xpath string.

Usage:

user> (first-by-xpath
        (get-page your-client "http://www.example.com/")
        "//a")
;=> #<HtmlAnchor HtmlAnchor[<a href="http://www.iana.org/domains/example">]>
(defn first-by-xpath
  "Evaluates the XPath expression `xpath` against `page` and returns the
  result of the page's `getFirstByXPath` call."
  [^HtmlPage page, ^String xpath]
  (. page getFirstByXPath xpath))

Returns matches for a given CSS selector

Usage:

user> (css your-page "a.gbzt")
;=> (#<HtmlAnchor HtmlAnchor[<a onclick...>]>, ...)

See: http://www.goodercode.com/wp/use-css-selectors-with-htmlunit/

TODO: Bumping the version of css-selectors to 1.0.4 breaks querying by CSS.

(defn css
  "Runs the CSS `selector` against the document element of `page` using a
  DOMNodeSelector and returns the matches as a seq. Because the result is
  passed through `seq`, an empty result comes back as nil."
  [^HtmlPage page, ^String selector]
  (let [root (.getDocumentElement page)
        node-selector (DOMNodeSelector. root)]
    (seq (.querySelectorAll node-selector selector))))

Returns a node's XML representation.

Usage:

user> (node-xml
        (first-by-xpath
          (get-page (make-client) "http://www.example.com/")
         "//a"))
;=> "<a href=\"http://www.iana.org/domains/example\">\n  More information...\n</a>\n"

(defn node-xml
  "Returns the XML representation of `node`, as produced by its `asXml`
  method.

  **Usage:**
    user> (node-xml
            (first-by-xpath
              (get-page (make-client) \"http://www.example.com/\")
              \"//a\"))
    ;=> an XML string for the matched <a> element"
  [^DomNode node]
  (. node asXml))

Returns a node's text value

Usage:

user> (node-text #<HtmlAnchor HtmlAnchor[<a class="foo" id="bar" href="http://example.com">]>)
;=> "Search"
(defn node-text
  "Returns the text value of `node`, as produced by its `asText` method."
  [^DomNode node]
  (. node asText))

Returns a clojure map of attributes for a given node

Usage:

user> (attr-map #<HtmlAnchor HtmlAnchor[<a class="foo" id="bar" href="http://example.com">]>)
;=> {:text "Search", :href "http://example.com", :id "bar", :class "foo"}

See also: yokogiri.core/attrs

(defn attr-map
  "Returns a map of the attributes of `node`: each attribute name becomes a
  keyword key mapped to the attribute's value. The node's text (see
  `node-text`) is added last under `:text`, so it replaces any attribute
  that is itself named \"text\"."
  [^DomNode node]
  (let [^NamedNodeMap attributes (.getAttributes node)
        add-attr (fn [m idx]
                   (let [^DomAttr a (.item attributes idx)]
                     (assoc m (keyword (.getName a)) (.getValue a))))
        by-name (reduce add-attr {} (range (.getLength attributes)))]
    (assoc by-name :text (node-text node))))

See also: yokogiri.core/attr-map

;; Alias for attr-map. It holds the var rather than the function value, so
;; calls through `attrs` follow later redefinitions of attr-map.
(def attrs (var yokogiri.core/attr-map))

Returns the HtmlUnit DomAttr objects for a given node

See also: yokogiri.core/attr-map

TODO: http://htmlunit.sourceforge.net/apidocs/com/gargoylesoftware/htmlunit/html/DomAttr.html

(defn- dom-attr
  "Returns a lazy sequence of the attribute objects of `node`, taken by
  index from the node's attribute map."
  [^DomNode node]
  (let [^NamedNodeMap attributes (.getAttributes node)]
    (for [idx (range (.getLength attributes))]
      (.item attributes idx))))
;; REPL scratch pad: the forms inside `comment` are never evaluated when the
;; namespace loads. They show a typical session: create a client, fetch a
;; page, query it with xpath, and map `attrs` over the nodes matched by css.
(comment
  (def c (make-client))
  (def p (get-page c "http://www.example.com/"))
  (xpath p "//a")
  (map attrs (css p "p")))