Parse HTML to an XML document via xml-conduit
This commit is contained in:
parent
5e068b32b7
commit
940c1dddcd
65
yesod-test/Yesod/Test/HtmlParse.hs
Normal file
65
yesod-test/Yesod/Test/HtmlParse.hs
Normal file
@ -0,0 +1,65 @@
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
-- | Parse an HTML document into xml-conduit's Document.
|
||||
--
|
||||
-- Assumes UTF-8 encoding.
|
||||
module Yesod.Test.HtmlParse
|
||||
( parseHtml
|
||||
) where
|
||||
|
||||
import Text.HTML.TagStream
|
||||
import qualified Data.ByteString as S
|
||||
import qualified Data.ByteString.Lazy as L
|
||||
import Text.XML
|
||||
import Data.Conduit
|
||||
import qualified Data.Conduit.List as CL
|
||||
import Data.Functor.Identity (runIdentity)
|
||||
import Control.Monad.Trans.Resource (runExceptionT)
|
||||
import Data.XML.Types (Event (..), Content (ContentText))
|
||||
import Control.Arrow ((***))
|
||||
import Data.Text.Encoding (decodeUtf8With)
|
||||
import Data.Text.Encoding.Error (lenientDecode)
|
||||
import qualified Data.Set as Set
|
||||
|
||||
parseHtml :: L.ByteString -> Either String Document
|
||||
parseHtml lbs =
|
||||
either (Left . show) Right
|
||||
$ runIdentity
|
||||
$ runExceptionT
|
||||
$ CL.sourceList (L.toChunks lbs)
|
||||
$$ tokenStream =$ (CL.concatMap toEvent =$ fromEvents)
|
||||
|
||||
toEvent :: Token -> [Event]
|
||||
toEvent (TagOpen bsname bsattrs isClose') =
|
||||
EventBeginElement name attrs : if isClose then [EventEndElement name] else []
|
||||
where
|
||||
name = toName bsname
|
||||
attrs = map (toName *** (return . ContentText . decodeUtf8With lenientDecode)) bsattrs
|
||||
isClose = isClose' || isVoid bsname
|
||||
toEvent (TagClose bsname) = [EventEndElement $ toName bsname]
|
||||
toEvent (Text bs) = [EventContent $ ContentText $ decodeUtf8With lenientDecode bs]
|
||||
toEvent (Comment bs) = [EventComment $ decodeUtf8With lenientDecode bs]
|
||||
toEvent Special{} = []
|
||||
toEvent Incomplete{} = []
|
||||
|
||||
toName :: S.ByteString -> Name
|
||||
toName bs = Name (decodeUtf8With lenientDecode bs) Nothing Nothing
|
||||
|
||||
isVoid :: S.ByteString -> Bool
|
||||
isVoid = flip Set.member $ Set.fromList
|
||||
[ "area"
|
||||
, "base"
|
||||
, "br"
|
||||
, "col"
|
||||
, "command"
|
||||
, "embed"
|
||||
, "hr"
|
||||
, "img"
|
||||
, "input"
|
||||
, "keygen"
|
||||
, "link"
|
||||
, "meta"
|
||||
, "param"
|
||||
, "source"
|
||||
, "track"
|
||||
, "wbr"
|
||||
]
|
||||
@ -5,9 +5,12 @@ import Test.Hspec.HUnit ()
|
||||
|
||||
import Yesod.Test.CssQuery
|
||||
import Yesod.Test.TransversingCSS
|
||||
import Yesod.Test.HtmlParse
|
||||
import Text.XML
|
||||
|
||||
parseQuery_ = either error id . parseQuery
|
||||
findBySelector_ x = either error id . findBySelector x
|
||||
parseHtml_ = either error id . parseHtml
|
||||
|
||||
main :: IO ()
|
||||
main = hspecX $ do
|
||||
@ -24,3 +27,33 @@ main = hspecX $ do
|
||||
let html = "<html><head><title>foo</title></head><body><br><p>Hello World</p></body></html>"
|
||||
query = "body > p"
|
||||
in findBySelector_ html query @?= ["<p>Hello World</p>"]
|
||||
describe "HTML parsing" $ do
|
||||
it "XHTML" $
|
||||
let html = "<html><head><title>foo</title></head><body><p>Hello World</p></body></html>"
|
||||
doc = Document (Prologue [] Nothing []) root []
|
||||
root = Element "html" []
|
||||
[ NodeElement $ Element "head" []
|
||||
[ NodeElement $ Element "title" []
|
||||
[NodeContent "foo"]
|
||||
]
|
||||
, NodeElement $ Element "body" []
|
||||
[ NodeElement $ Element "p" []
|
||||
[NodeContent "Hello World"]
|
||||
]
|
||||
]
|
||||
in parseHtml_ html @?= doc
|
||||
it "HTML" $
|
||||
let html = "<html><head><title>foo</title></head><body><br><p>Hello World</p></body></html>"
|
||||
doc = Document (Prologue [] Nothing []) root []
|
||||
root = Element "html" []
|
||||
[ NodeElement $ Element "head" []
|
||||
[ NodeElement $ Element "title" []
|
||||
[NodeContent "foo"]
|
||||
]
|
||||
, NodeElement $ Element "body" []
|
||||
[ NodeElement $ Element "br" [] []
|
||||
, NodeElement $ Element "p" []
|
||||
[NodeContent "Hello World"]
|
||||
]
|
||||
]
|
||||
in parseHtml_ html @?= doc
|
||||
|
||||
@ -34,9 +34,16 @@ library
|
||||
, bytestring >= 0.9
|
||||
, case-insensitive >= 0.2
|
||||
, text
|
||||
, tagstream-conduit >= 0.3 && < 0.4
|
||||
, conduit >= 0.4 && < 0.5
|
||||
, resourcet >= 0.3 && < 0.4
|
||||
, xml-conduit >= 0.7 && < 0.8
|
||||
, xml-types >= 0.3 && < 0.4
|
||||
, containers
|
||||
exposed-modules: Yesod.Test
|
||||
Yesod.Test.CssQuery
|
||||
Yesod.Test.TransversingCSS
|
||||
Yesod.Test.HtmlParse
|
||||
ghc-options: -Wall
|
||||
|
||||
test-suite test
|
||||
@ -47,6 +54,7 @@ test-suite test
|
||||
, yesod-test
|
||||
, hspec >= 0.9 && < 0.10
|
||||
, HUnit
|
||||
, xml-conduit
|
||||
|
||||
source-repository head
|
||||
type: git
|
||||
|
||||
Loading…
Reference in New Issue
Block a user