From 940c1dddcd07e34de03b87f682ec3711e7f344c5 Mon Sep 17 00:00:00 2001 From: Michael Snoyman Date: Wed, 28 Mar 2012 08:43:36 +0200 Subject: [PATCH] Parse HTML to an XML document via xml-conduit --- yesod-test/Yesod/Test/HtmlParse.hs | 65 ++++++++++++++++++++++++++++++ yesod-test/test/main.hs | 33 +++++++++++++++ yesod-test/yesod-test.cabal | 8 ++++ 3 files changed, 106 insertions(+) create mode 100644 yesod-test/Yesod/Test/HtmlParse.hs diff --git a/yesod-test/Yesod/Test/HtmlParse.hs b/yesod-test/Yesod/Test/HtmlParse.hs new file mode 100644 index 00000000..47082c37 --- /dev/null +++ b/yesod-test/Yesod/Test/HtmlParse.hs @@ -0,0 +1,65 @@ +{-# LANGUAGE OverloadedStrings #-} +-- | Parse an HTML document into xml-conduit's Document. +-- +-- Assumes UTF-8 encoding. +module Yesod.Test.HtmlParse + ( parseHtml + ) where + +import Text.HTML.TagStream +import qualified Data.ByteString as S +import qualified Data.ByteString.Lazy as L +import Text.XML +import Data.Conduit +import qualified Data.Conduit.List as CL +import Data.Functor.Identity (runIdentity) +import Control.Monad.Trans.Resource (runExceptionT) +import Data.XML.Types (Event (..), Content (ContentText)) +import Control.Arrow ((***)) +import Data.Text.Encoding (decodeUtf8With) +import Data.Text.Encoding.Error (lenientDecode) +import qualified Data.Set as Set + +parseHtml :: L.ByteString -> Either String Document +parseHtml lbs = + either (Left . show) Right + $ runIdentity + $ runExceptionT + $ CL.sourceList (L.toChunks lbs) + $$ tokenStream =$ (CL.concatMap toEvent =$ fromEvents) + +toEvent :: Token -> [Event] +toEvent (TagOpen bsname bsattrs isClose') = + EventBeginElement name attrs : if isClose then [EventEndElement name] else [] + where + name = toName bsname + attrs = map (toName *** (return . ContentText . decodeUtf8With lenientDecode)) bsattrs + isClose = isClose' || isVoid bsname +toEvent (TagClose bsname) = [EventEndElement $ toName bsname] +toEvent (Text bs) = [EventContent $ ContentText $ decodeUtf8With lenientDecode bs] +toEvent (Comment bs) = [EventComment $ decodeUtf8With lenientDecode bs] +toEvent Special{} = [] +toEvent Incomplete{} = [] + +toName :: S.ByteString -> Name +toName bs = Name (decodeUtf8With lenientDecode bs) Nothing Nothing + +isVoid :: S.ByteString -> Bool +isVoid = flip Set.member $ Set.fromList + [ "area" + , "base" + , "br" + , "col" + , "command" + , "embed" + , "hr" + , "img" + , "input" + , "keygen" + , "link" + , "meta" + , "param" + , "source" + , "track" + , "wbr" + ] diff --git a/yesod-test/test/main.hs b/yesod-test/test/main.hs index bd3e2bc0..54ef466f 100644 --- a/yesod-test/test/main.hs +++ b/yesod-test/test/main.hs @@ -5,9 +5,12 @@ import Test.Hspec.HUnit () import Yesod.Test.CssQuery import Yesod.Test.TransversingCSS +import Yesod.Test.HtmlParse +import Text.XML parseQuery_ = either error id . parseQuery findBySelector_ x = either error id . findBySelector x +parseHtml_ = either error id . parseHtml main :: IO () main = hspecX $ do @@ -24,3 +27,33 @@ main = hspecX $ do let html = "foo

Hello World

" query = "body > p" in findBySelector_ html query @?= ["

Hello World

"] + describe "HTML parsing" $ do + it "XHTML" $ + let html = "foo

Hello World

" + doc = Document (Prologue [] Nothing []) root [] + root = Element "html" [] + [ NodeElement $ Element "head" [] + [ NodeElement $ Element "title" [] + [NodeContent "foo"] + ] + , NodeElement $ Element "body" [] + [ NodeElement $ Element "p" [] + [NodeContent "Hello World"] + ] + ] + in parseHtml_ html @?= doc + it "HTML" $ + let html = "foo

Hello World

" + doc = Document (Prologue [] Nothing []) root [] + root = Element "html" [] + [ NodeElement $ Element "head" [] + [ NodeElement $ Element "title" [] + [NodeContent "foo"] + ] + , NodeElement $ Element "body" [] + [ NodeElement $ Element "br" [] [] + , NodeElement $ Element "p" [] + [NodeContent "Hello World"] + ] + ] + in parseHtml_ html @?= doc diff --git a/yesod-test/yesod-test.cabal b/yesod-test/yesod-test.cabal index 079e64cf..635484d9 100644 --- a/yesod-test/yesod-test.cabal +++ b/yesod-test/yesod-test.cabal @@ -34,9 +34,16 @@ library , bytestring >= 0.9 , case-insensitive >= 0.2 , text + , tagstream-conduit >= 0.3 && < 0.4 + , conduit >= 0.4 && < 0.5 + , resourcet >= 0.3 && < 0.4 + , xml-conduit >= 0.7 && < 0.8 + , xml-types >= 0.3 && < 0.4 + , containers exposed-modules: Yesod.Test Yesod.Test.CssQuery Yesod.Test.TransversingCSS + Yesod.Test.HtmlParse ghc-options: -Wall test-suite test @@ -47,6 +54,7 @@ test-suite test , yesod-test , hspec >= 0.9 && < 0.10 , HUnit + , xml-conduit source-repository head type: git