Parse HTML to an XML document via xml-conduit

This commit is contained in:
Michael Snoyman 2012-03-28 08:43:36 +02:00
parent 5e068b32b7
commit 940c1dddcd
3 changed files with 106 additions and 0 deletions

View File

@ -0,0 +1,65 @@
{-# LANGUAGE OverloadedStrings #-}
-- | Parse an HTML document into xml-conduit's Document.
--
-- Assumes UTF-8 encoding.
module Yesod.Test.HtmlParse
( parseHtml
) where
import Text.HTML.TagStream
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as L
import Text.XML
import Data.Conduit
import qualified Data.Conduit.List as CL
import Data.Functor.Identity (runIdentity)
import Control.Monad.Trans.Resource (runExceptionT)
import Data.XML.Types (Event (..), Content (ContentText))
import Control.Arrow ((***))
import Data.Text.Encoding (decodeUtf8With)
import Data.Text.Encoding.Error (lenientDecode)
import qualified Data.Set as Set
parseHtml :: L.ByteString -> Either String Document
parseHtml lbs =
either (Left . show) Right
$ runIdentity
$ runExceptionT
$ CL.sourceList (L.toChunks lbs)
$$ tokenStream =$ (CL.concatMap toEvent =$ fromEvents)
toEvent :: Token -> [Event]
toEvent (TagOpen bsname bsattrs isClose') =
EventBeginElement name attrs : if isClose then [EventEndElement name] else []
where
name = toName bsname
attrs = map (toName *** (return . ContentText . decodeUtf8With lenientDecode)) bsattrs
isClose = isClose' || isVoid bsname
toEvent (TagClose bsname) = [EventEndElement $ toName bsname]
toEvent (Text bs) = [EventContent $ ContentText $ decodeUtf8With lenientDecode bs]
toEvent (Comment bs) = [EventComment $ decodeUtf8With lenientDecode bs]
toEvent Special{} = []
toEvent Incomplete{} = []
toName :: S.ByteString -> Name
toName bs = Name (decodeUtf8With lenientDecode bs) Nothing Nothing
isVoid :: S.ByteString -> Bool
isVoid = flip Set.member $ Set.fromList
[ "area"
, "base"
, "br"
, "col"
, "command"
, "embed"
, "hr"
, "img"
, "input"
, "keygen"
, "link"
, "meta"
, "param"
, "source"
, "track"
, "wbr"
]

View File

@ -5,9 +5,12 @@ import Test.Hspec.HUnit ()
import Yesod.Test.CssQuery
import Yesod.Test.TransversingCSS
import Yesod.Test.HtmlParse
import Text.XML
parseQuery_ = either error id . parseQuery
findBySelector_ x = either error id . findBySelector x
parseHtml_ = either error id . parseHtml
main :: IO ()
main = hspecX $ do
@ -24,3 +27,33 @@ main = hspecX $ do
let html = "<html><head><title>foo</title></head><body><br><p>Hello World</p></body></html>"
query = "body > p"
in findBySelector_ html query @?= ["<p>Hello World</p>"]
describe "HTML parsing" $ do
it "XHTML" $
let html = "<html><head><title>foo</title></head><body><p>Hello World</p></body></html>"
doc = Document (Prologue [] Nothing []) root []
root = Element "html" []
[ NodeElement $ Element "head" []
[ NodeElement $ Element "title" []
[NodeContent "foo"]
]
, NodeElement $ Element "body" []
[ NodeElement $ Element "p" []
[NodeContent "Hello World"]
]
]
in parseHtml_ html @?= doc
it "HTML" $
let html = "<html><head><title>foo</title></head><body><br><p>Hello World</p></body></html>"
doc = Document (Prologue [] Nothing []) root []
root = Element "html" []
[ NodeElement $ Element "head" []
[ NodeElement $ Element "title" []
[NodeContent "foo"]
]
, NodeElement $ Element "body" []
[ NodeElement $ Element "br" [] []
, NodeElement $ Element "p" []
[NodeContent "Hello World"]
]
]
in parseHtml_ html @?= doc

View File

@ -34,9 +34,16 @@ library
, bytestring >= 0.9
, case-insensitive >= 0.2
, text
, tagstream-conduit >= 0.3 && < 0.4
, conduit >= 0.4 && < 0.5
, resourcet >= 0.3 && < 0.4
, xml-conduit >= 0.7 && < 0.8
, xml-types >= 0.3 && < 0.4
, containers
exposed-modules: Yesod.Test
Yesod.Test.CssQuery
Yesod.Test.TransversingCSS
Yesod.Test.HtmlParse
ghc-options: -Wall
test-suite test
@ -47,6 +54,7 @@ test-suite test
, yesod-test
, hspec >= 0.9 && < 0.10
, HUnit
, xml-conduit
source-repository head
type: git