yokogiri 1.5.8 — Barebones Nokogiri for Clojure
Dependencies
(this space intentionally left almost blank)
(ns yokogiri.core (:require [clojure.java.io :as io]) (:import [com.gargoylesoftware.htmlunit StringWebResponse WebClient BrowserVersion WebClientOptions] [com.gargoylesoftware.htmlunit.html HtmlPage DomNode DomAttr HTMLParser] [org.w3c.dom NamedNodeMap Node] [se.fishtank.css.selectors.dom DOMNodeSelector])) | ||||||||||
(set! *warn-on-reflection* true) | ||||||||||
(defn- web-client-options
  "Returns the client options object for a WebClient (the result of
  calling `getOptions` on it)."
  [^WebClient client]
  (. client getOptions))
(def set-client-options-map
  "Maps an option keyword to a two-argument function `(fn [options value])`
  that invokes the matching setter on a WebClientOptions instance.

  `:print-content-on-failing-status-code` is an alias of
  `:print-content-on-failing-status`: it is the key name produced by
  `get-client-options`, so a map obtained from that function can be passed
  back to `set-client-options!` without renaming keys."
  {:activex-native                       #(.setActiveXNative ^WebClientOptions %1 %2)
   :applet                               #(.setAppletEnabled ^WebClientOptions %1 %2)
   :block-popups                         #(.setPopupBlockerEnabled ^WebClientOptions %1 %2)
   :css                                  #(.setCssEnabled ^WebClientOptions %1 %2)
   :geolocation                          #(.setGeolocationEnabled ^WebClientOptions %1 %2)
   :homepage                             #(.setHomePage ^WebClientOptions %1 %2)
   :insecure-ssl                         #(.setUseInsecureSSL ^WebClientOptions %1 %2)
   :print-content-on-failing-status      #(.setPrintContentOnFailingStatusCode ^WebClientOptions %1 %2)
   ;; Alias: same setter, spelled the way get-client-options reports it.
   :print-content-on-failing-status-code #(.setPrintContentOnFailingStatusCode ^WebClientOptions %1 %2)
   :redirects                            #(.setRedirectEnabled ^WebClientOptions %1 %2)
   :throw-on-failing-status              #(.setThrowExceptionOnFailingStatusCode ^WebClientOptions %1 %2)
   :throw-on-script-error                #(.setThrowExceptionOnScriptError ^WebClientOptions %1 %2)
   :timeout                              #(.setTimeout ^WebClientOptions %1 %2)
   ;; Forwards to setDoNotTrackEnabled, i.e. the value toggles Do-Not-Track.
   :tracking                             #(.setDoNotTrackEnabled ^WebClientOptions %1 %2)
   :javascript                           #(.setJavaScriptEnabled ^WebClientOptions %1 %2)})
;; Forward declaration: the one-argument arity below refers to *client*,
;; which is only given its value further down the file.
(declare ^:dynamic *client*)

(defn set-client-options!
  "Sets options on the client and returns the client.

  With one argument the options are applied to the dynamically bound
  `*client*`; with two arguments they are applied to `client`.

  **Usage:**

    user> (set-client-options! client {:javascript false :redirects true})

  **Available Options** (option key -> WebClientOptions setter):

    :activex-native                   setActiveXNative
    :applet                           setAppletEnabled
    :block-popups                     setPopupBlockerEnabled
    :css                              setCssEnabled
    :geolocation                      setGeolocationEnabled
    :homepage                         setHomePage
    :insecure-ssl                     setUseInsecureSSL
    :javascript                       setJavaScriptEnabled
    :print-content-on-failing-status  setPrintContentOnFailingStatusCode
    :redirects                        setRedirectEnabled
    :throw-on-failing-status          setThrowExceptionOnFailingStatusCode
    :throw-on-script-error            setThrowExceptionOnScriptError
    :timeout                          setTimeout
    :tracking                         setDoNotTrackEnabled

  Throws IllegalArgumentException when `opts` contains a key that is not
  present in `set-client-options-map`. All keys are checked before any
  setter runs, so a bad key never leaves the client half-configured."
  ([opts]
   (set-client-options! *client* opts))
  ([^WebClient client opts]
   ;; `(map first opts)` rather than `keys` so that any seq of [k v] pairs
   ;; accepted by the destructuring below is also accepted here.
   (let [unknown (seq (remove #(contains? set-client-options-map %)
                              (map first opts)))]
     (when unknown
       (throw (IllegalArgumentException.
               (str "Unknown client option(s): " (pr-str (vec unknown))
                    "; valid options are: "
                    (pr-str (vec (sort (keys set-client-options-map))))))))
     (let [^WebClientOptions client-opts (web-client-options client)]
       (doseq [[k v] opts]
         ((get set-client-options-map k) client-opts v))
       client))))
(defn get-client-options
  "Returns a map of all options currently set on a client, read from the
  client's WebClientOptions object.

  **Usage:**

    user> (get-client-options (make-client))

  Note that the key `:print-content-on-failing-status-code` carries a
  `-code` suffix that the corresponding `:print-content-on-failing-status`
  setter option does not."
  [^WebClient client]
  (let [^WebClientOptions o (web-client-options client)]
    {:activex-native                       (.isActiveXNative o)
     :applet                               (.isAppletEnabled o)
     :block-popups                         (.isPopupBlockerEnabled o)
     :css                                  (.isCssEnabled o)
     :geolocation                          (.isGeolocationEnabled o)
     :homepage                             (.getHomePage o)
     :insecure-ssl                         (.isUseInsecureSSL o)
     :javascript                           (.isJavaScriptEnabled o)
     :print-content-on-failing-status-code (.getPrintContentOnFailingStatusCode o)
     :redirects                            (.isRedirectEnabled o)
     :throw-on-failing-status              (.isThrowExceptionOnFailingStatusCode o)
     :throw-on-script-error                (.isThrowExceptionOnScriptError o)
     :timeout                              (.getTimeout o)
     :tracking                             (.isDoNotTrackEnabled o)}))
(defn make-client
  "Constructs a new WebClient.

  **Usage:**

    user> (make-client)

  **With Options:**

    user> (make-client :javascript false :redirects true)

  Options are given as keyword arguments and are applied with
  `set-client-options!`; the accepted keys are those of
  `set-client-options-map`.

  See also: yokogiri.core/set-client-options!"
  [& {:as opts}]
  ;; Exactly one WebClient is created; it is configured in place when
  ;; options are present and returned either way.
  (let [client (WebClient.)]
    (when (seq opts)
      (set-client-options! client opts))
    client))
;; Default client used by the one-argument arities in this namespace.
;; defonce keeps the existing client when the namespace is reloaded.
(defonce ^{:dynamic true} *client*
  (make-client))
(defmacro with-client
  "Takes a client which will be bound to `*client*` within the scope of
  the form.

  **Usage:**

    user> (with-client (make-client :javascript false)
            (get-page \"http://www.example.com/\"))"
  [c & body]
  `(binding [*client* ~c]
     ~@body))
(defn create-page
  "Takes a string of markup, returns an HtmlPage parsed from it.

  The markup is wrapped in a StringWebResponse whose URL is the
  placeholder `file://fake-response-url`; no request is made for it.

  **Usage:**

    user> (create-page \"<html><body><a href=\\\"http://example.com\\\">Link</a></body></html>\")
    ;=> #<HtmlPage HtmlPage(file://fake-response-url)@478170219>"
  [xml]
  ;; NOTE(review): a fresh WebClient is built on every call purely to obtain
  ;; a window to parse into, and it is never closed here -- confirm whether
  ;; that is acceptable for long-running callers.
  (let [fake-url (io/as-url "file://fake-response-url")
        response (StringWebResponse. xml fake-url)
        window   (.getCurrentWindow (WebClient.))]
    (HTMLParser/parseHtml response window)))
(defn get-page
  "Takes a client and a url, returns the page the client loads for that
  url. When called with only a url, the dynamically bound `*client*` is
  used.

  **Usage:**

    user> (get-page (make-client) \"http://www.example.com/\")
    user> (get-page \"http://www.example.com/\")"
  ([url]
   (get-page *client* url))
  ([^WebClient client, ^String url]
   (. client getPage url)))
(defn as-page
  "Takes a path as a string and creates a Page you can access with
  #'yokogiri.core/xpath, #'yokogiri.core/css, etc.

  The path is turned into a file, then a URL, then that URL's string form,
  which is handed to `get-page`. Without an explicit client, `*client*` is
  used.

  **Usage:**

    user> (as-page \"docs/uberdoc.html\")
    user> (as-page (make-client) \"docs/uberdoc.html\")"
  ([path]
   (as-page *client* path))
  ([client path]
   (let [file-url (io/as-url (io/file path))]
     (get-page client (str file-url)))))
(defn xpath
  "Takes an HtmlPage and an xpath string. Returns a vector of nodes which
  match the provided xpath string.

  **Usage:**

    user> (xpath (get-page \"http://www.example.com/\") \"//a\")"
  [^HtmlPage page, ^String xpath]
  (let [matches (. page getByXPath xpath)]
    (vec matches)))
(defn first-by-xpath
  "Takes an HtmlPage and an xpath string. Returns the first node which
  matches the provided xpath string.

  **Usage:**

    user> (first-by-xpath (get-page \"http://www.example.com/\") \"//a\")"
  [^HtmlPage page, ^String xpath]
  (. page getFirstByXPath xpath))
(defn css
  "Returns matches for a given CSS selector, as a seq (nil when nothing
  matches).

  **Usage:**

    user> (css (get-page \"http://www.example.com/\") \"p\")

  Background: http://www.goodercode.com/wp/use-css-selectors-with-htmlunit/

  TODO: Bumping the version of css-selectors to 1.0.4 breaks querying by CSS."
  [^HtmlPage page, ^String selector]
  (-> (.getDocumentElement page)
      (DOMNodeSelector.)
      (.querySelectorAll selector)
      seq))
(defn node-xml
  "Returns a node's XML representation, as produced by the node's `asXml`
  method.

  **Usage:**

    user> (node-xml
            (first-by-xpath
              (get-page (make-client) \"http://www.example.com/\")
              \"//a\"))
    ;=> the <a href=\"http://www.iana.org/domains/example\"> element,
    ;   containing the text \"More information...\", serialized as XML"
  [^DomNode node]
  (. node asXml))
(defn node-text
  "Returns a node's text value, as produced by the node's `asText` method.

  **Usage:**

    user> (node-text
            (first-by-xpath (get-page \"http://www.example.com/\") \"//a\"))"
  [^DomNode node]
  (. node asText))
(defn attr-map
  "Returns a clojure map of attributes for a given node.

  Each attribute becomes a keyword key with the attribute's value. The
  node's text (see `node-text`) is added last under `:text`, so it
  replaces any attribute that is itself named `text`.

  **Usage:**

    user> (attr-map
            (first-by-xpath (get-page \"http://www.example.com/\") \"//a\"))

  See also: yokogiri.core/attrs"
  [^DomNode node]
  (let [^NamedNodeMap attributes (.getAttributes node)
        add-attr (fn [m i]
                   (let [^DomAttr a (.item attributes i)]
                     (assoc m (keyword (.getName a)) (.getValue a))))
        by-name  (reduce add-attr {} (range (.getLength attributes)))]
    (assoc by-name :text (node-text node))))
;; Alias for attr-map. It holds the var rather than the function value, so
;; calls through `attrs` follow any later redefinition of attr-map.
;; See also: yokogiri.core/attr-map
(def attrs
  (var yokogiri.core/attr-map))
(defn- dom-attr
  "Returns the HtmlUnit DomAttr objects for a given node, as a lazy seq in
  the order of the node's attribute map.

  See also: yokogiri.core/attr-map

  TODO: http://htmlunit.sourceforge.net/apidocs/com/gargoylesoftware/htmlunit/html/DomAttr.html"
  [^DomNode node]
  (let [^NamedNodeMap attributes (.getAttributes node)
        n (.getLength attributes)]
    (map (fn [i] (.item attributes i))
         (range n))))
;; REPL scratchpad: the body of a `comment` form is never evaluated when
;; the namespace loads.
(comment
  (def c (make-client))
  (def p (get-page c "http://www.example.com/"))
  (xpath p "//a")
  (map attrs (css p "p")))