Parse HTML to an XML document via xml-conduit
This commit is contained in:
parent
5e068b32b7
commit
940c1dddcd
65
yesod-test/Yesod/Test/HtmlParse.hs
Normal file
65
yesod-test/Yesod/Test/HtmlParse.hs
Normal file
@ -0,0 +1,65 @@
|
|||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
-- | Parse an HTML document into xml-conduit's Document.
|
||||||
|
--
|
||||||
|
-- Assumes UTF-8 encoding.
|
||||||
|
module Yesod.Test.HtmlParse
|
||||||
|
( parseHtml
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Text.HTML.TagStream
|
||||||
|
import qualified Data.ByteString as S
|
||||||
|
import qualified Data.ByteString.Lazy as L
|
||||||
|
import Text.XML
|
||||||
|
import Data.Conduit
|
||||||
|
import qualified Data.Conduit.List as CL
|
||||||
|
import Data.Functor.Identity (runIdentity)
|
||||||
|
import Control.Monad.Trans.Resource (runExceptionT)
|
||||||
|
import Data.XML.Types (Event (..), Content (ContentText))
|
||||||
|
import Control.Arrow ((***))
|
||||||
|
import Data.Text.Encoding (decodeUtf8With)
|
||||||
|
import Data.Text.Encoding.Error (lenientDecode)
|
||||||
|
import qualified Data.Set as Set
|
||||||
|
|
||||||
|
parseHtml :: L.ByteString -> Either String Document
|
||||||
|
parseHtml lbs =
|
||||||
|
either (Left . show) Right
|
||||||
|
$ runIdentity
|
||||||
|
$ runExceptionT
|
||||||
|
$ CL.sourceList (L.toChunks lbs)
|
||||||
|
$$ tokenStream =$ (CL.concatMap toEvent =$ fromEvents)
|
||||||
|
|
||||||
|
toEvent :: Token -> [Event]
|
||||||
|
toEvent (TagOpen bsname bsattrs isClose') =
|
||||||
|
EventBeginElement name attrs : if isClose then [EventEndElement name] else []
|
||||||
|
where
|
||||||
|
name = toName bsname
|
||||||
|
attrs = map (toName *** (return . ContentText . decodeUtf8With lenientDecode)) bsattrs
|
||||||
|
isClose = isClose' || isVoid bsname
|
||||||
|
toEvent (TagClose bsname) = [EventEndElement $ toName bsname]
|
||||||
|
toEvent (Text bs) = [EventContent $ ContentText $ decodeUtf8With lenientDecode bs]
|
||||||
|
toEvent (Comment bs) = [EventComment $ decodeUtf8With lenientDecode bs]
|
||||||
|
toEvent Special{} = []
|
||||||
|
toEvent Incomplete{} = []
|
||||||
|
|
||||||
|
toName :: S.ByteString -> Name
|
||||||
|
toName bs = Name (decodeUtf8With lenientDecode bs) Nothing Nothing
|
||||||
|
|
||||||
|
isVoid :: S.ByteString -> Bool
|
||||||
|
isVoid = flip Set.member $ Set.fromList
|
||||||
|
[ "area"
|
||||||
|
, "base"
|
||||||
|
, "br"
|
||||||
|
, "col"
|
||||||
|
, "command"
|
||||||
|
, "embed"
|
||||||
|
, "hr"
|
||||||
|
, "img"
|
||||||
|
, "input"
|
||||||
|
, "keygen"
|
||||||
|
, "link"
|
||||||
|
, "meta"
|
||||||
|
, "param"
|
||||||
|
, "source"
|
||||||
|
, "track"
|
||||||
|
, "wbr"
|
||||||
|
]
|
||||||
@ -5,9 +5,12 @@ import Test.Hspec.HUnit ()
|
|||||||
|
|
||||||
import Yesod.Test.CssQuery
|
import Yesod.Test.CssQuery
|
||||||
import Yesod.Test.TransversingCSS
|
import Yesod.Test.TransversingCSS
|
||||||
|
import Yesod.Test.HtmlParse
|
||||||
|
import Text.XML
|
||||||
|
|
||||||
parseQuery_ = either error id . parseQuery
|
parseQuery_ = either error id . parseQuery
|
||||||
findBySelector_ x = either error id . findBySelector x
|
findBySelector_ x = either error id . findBySelector x
|
||||||
|
parseHtml_ = either error id . parseHtml
|
||||||
|
|
||||||
main :: IO ()
|
main :: IO ()
|
||||||
main = hspecX $ do
|
main = hspecX $ do
|
||||||
@ -24,3 +27,33 @@ main = hspecX $ do
|
|||||||
let html = "<html><head><title>foo</title></head><body><br><p>Hello World</p></body></html>"
|
let html = "<html><head><title>foo</title></head><body><br><p>Hello World</p></body></html>"
|
||||||
query = "body > p"
|
query = "body > p"
|
||||||
in findBySelector_ html query @?= ["<p>Hello World</p>"]
|
in findBySelector_ html query @?= ["<p>Hello World</p>"]
|
||||||
|
describe "HTML parsing" $ do
|
||||||
|
it "XHTML" $
|
||||||
|
let html = "<html><head><title>foo</title></head><body><p>Hello World</p></body></html>"
|
||||||
|
doc = Document (Prologue [] Nothing []) root []
|
||||||
|
root = Element "html" []
|
||||||
|
[ NodeElement $ Element "head" []
|
||||||
|
[ NodeElement $ Element "title" []
|
||||||
|
[NodeContent "foo"]
|
||||||
|
]
|
||||||
|
, NodeElement $ Element "body" []
|
||||||
|
[ NodeElement $ Element "p" []
|
||||||
|
[NodeContent "Hello World"]
|
||||||
|
]
|
||||||
|
]
|
||||||
|
in parseHtml_ html @?= doc
|
||||||
|
it "HTML" $
|
||||||
|
let html = "<html><head><title>foo</title></head><body><br><p>Hello World</p></body></html>"
|
||||||
|
doc = Document (Prologue [] Nothing []) root []
|
||||||
|
root = Element "html" []
|
||||||
|
[ NodeElement $ Element "head" []
|
||||||
|
[ NodeElement $ Element "title" []
|
||||||
|
[NodeContent "foo"]
|
||||||
|
]
|
||||||
|
, NodeElement $ Element "body" []
|
||||||
|
[ NodeElement $ Element "br" [] []
|
||||||
|
, NodeElement $ Element "p" []
|
||||||
|
[NodeContent "Hello World"]
|
||||||
|
]
|
||||||
|
]
|
||||||
|
in parseHtml_ html @?= doc
|
||||||
|
|||||||
@ -34,9 +34,16 @@ library
|
|||||||
, bytestring >= 0.9
|
, bytestring >= 0.9
|
||||||
, case-insensitive >= 0.2
|
, case-insensitive >= 0.2
|
||||||
, text
|
, text
|
||||||
|
, tagstream-conduit >= 0.3 && < 0.4
|
||||||
|
, conduit >= 0.4 && < 0.5
|
||||||
|
, resourcet >= 0.3 && < 0.4
|
||||||
|
, xml-conduit >= 0.7 && < 0.8
|
||||||
|
, xml-types >= 0.3 && < 0.4
|
||||||
|
, containers
|
||||||
exposed-modules: Yesod.Test
|
exposed-modules: Yesod.Test
|
||||||
Yesod.Test.CssQuery
|
Yesod.Test.CssQuery
|
||||||
Yesod.Test.TransversingCSS
|
Yesod.Test.TransversingCSS
|
||||||
|
Yesod.Test.HtmlParse
|
||||||
ghc-options: -Wall
|
ghc-options: -Wall
|
||||||
|
|
||||||
test-suite test
|
test-suite test
|
||||||
@ -47,6 +54,7 @@ test-suite test
|
|||||||
, yesod-test
|
, yesod-test
|
||||||
, hspec >= 0.9 && < 0.10
|
, hspec >= 0.9 && < 0.10
|
||||||
, HUnit
|
, HUnit
|
||||||
|
, xml-conduit
|
||||||
|
|
||||||
source-repository head
|
source-repository head
|
||||||
type: git
|
type: git
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user