2012年5月28日月曜日

[Racket] HTMLのパース

RacketでHTMLをパースするにはhtmlモジュールとxmlモジュールを使います。
#lang racket

(require
 (prefix-in url: net/url)
 (prefix-in h: html)
 (prefix-in x: xml))

(define (get-attribute name elem)
  (let loop ((rest (h:html-element-attributes elem)))
    (cond
     [(null? rest) #f]
     [(eq? name (x:attribute-name (car rest)))
      (x:attribute-value (car rest))]
     [else (loop (cdr rest))])))

(define (get-element-by-id id elem)
  (match elem
    ([struct h:html-full (attributes content)]
     (let ((val (get-attribute 'id elem)))
       (if (and val (string=? val id))
    elem
    (let loop ((rest content))
      (if (null? rest)
   #f
   (or (get-element-by-id id (car rest))
       (loop (cdr rest))))))))
    ([struct h:html-element (attributes)]
     (let ((val (get-attribute 'id elem)))
       (if (and val (string=? val id))
    val
    #f)))
    (else #f)))

(define (get-html url-string)
  (h:read-html
   (url:get-pure-port
    (url:string->url url-string))))

(let ((obj (get-html "http://docs.racket-lang.org/html/index.html")))
  (let ((body (get-element-by-id "doc-racket-lang-org" obj)))
    (match body
      ([struct h:html-full (attributes content)]
       (displayln attributes)
       (displayln content)))))

0 件のコメント:

コメントを投稿