66 lines
1.8 KiB
Haskell
66 lines
1.8 KiB
Haskell
{-# LANGUAGE OverloadedStrings #-}
|
|
-- | Parse an HTML document into xml-conduit's Document.
|
|
--
|
|
-- Assumes UTF-8 encoding.
|
|
module Yesod.Test.HtmlParse
|
|
( parseHtml
|
|
) where
|
|
|
|
import Text.HTML.TagStream
|
|
import qualified Data.ByteString as S
|
|
import qualified Data.ByteString.Lazy as L
|
|
import Text.XML
|
|
import Data.Conduit
|
|
import qualified Data.Conduit.List as CL
|
|
import Data.Functor.Identity (runIdentity)
|
|
import Control.Monad.Trans.Resource (runExceptionT)
|
|
import Data.XML.Types (Event (..), Content (ContentText))
|
|
import Control.Arrow ((***))
|
|
import Data.Text.Encoding (decodeUtf8With)
|
|
import Data.Text.Encoding.Error (lenientDecode)
|
|
import qualified Data.Set as Set
|
|
|
|
parseHtml :: L.ByteString -> Either String Document
|
|
parseHtml lbs =
|
|
either (Left . show) Right
|
|
$ runIdentity
|
|
$ runExceptionT
|
|
$ CL.sourceList (L.toChunks lbs)
|
|
$$ tokenStream =$ (CL.concatMap toEvent =$ fromEvents)
|
|
|
|
toEvent :: Token -> [Event]
|
|
toEvent (TagOpen bsname bsattrs isClose') =
|
|
EventBeginElement name attrs : if isClose then [EventEndElement name] else []
|
|
where
|
|
name = toName bsname
|
|
attrs = map (toName *** (return . ContentText . decodeUtf8With lenientDecode)) bsattrs
|
|
isClose = isClose' || isVoid bsname
|
|
toEvent (TagClose bsname) = [EventEndElement $ toName bsname]
|
|
toEvent (Text bs) = [EventContent $ ContentText $ decodeUtf8With lenientDecode bs]
|
|
toEvent (Comment bs) = [EventComment $ decodeUtf8With lenientDecode bs]
|
|
toEvent Special{} = []
|
|
toEvent Incomplete{} = []
|
|
|
|
toName :: S.ByteString -> Name
|
|
toName bs = Name (decodeUtf8With lenientDecode bs) Nothing Nothing
|
|
|
|
isVoid :: S.ByteString -> Bool
|
|
isVoid = flip Set.member $ Set.fromList
|
|
[ "area"
|
|
, "base"
|
|
, "br"
|
|
, "col"
|
|
, "command"
|
|
, "embed"
|
|
, "hr"
|
|
, "img"
|
|
, "input"
|
|
, "keygen"
|
|
, "link"
|
|
, "meta"
|
|
, "param"
|
|
, "source"
|
|
, "track"
|
|
, "wbr"
|
|
]
|