fix up css parsing

This commit is contained in:
Greg Weber 2011-08-09 09:24:16 -07:00
parent ae3b146712
commit 66545752c6
4 changed files with 81 additions and 44 deletions

View File

@ -59,7 +59,7 @@ If anyone knows of better sources or thinks a particular tag/attribute/value may
style attribute
----------------
style attributes are now *parsed* with the css-text and autoparsec-text dependencies. They are then ran through a white list for properties and keywords. Whitespace is not preserved.
style attributes are now *parsed* with the css-text and autoparsec-text dependencies. They are then ran through a white list for properties and keywords. Whitespace is not preserved. This code was again translated from sanitizer.py, but uses attopoarsec-text instead of regexes.
data attributes
-------------------------

View File

@ -21,6 +21,7 @@ import Network.URI ( parseURIReference, URI (..),
import Codec.Binary.UTF8.String ( encodeString )
import qualified Data.Map as Map
import Data.Maybe (catMaybes)
@ -74,7 +75,7 @@ safeTags (t@(TagClose name):tags)
| otherwise = safeTags tags
safeTags (TagOpen name attributes:tags)
| safeTagName name = TagOpen name
(map sanitizeAttribute $ filter safeAttribute attributes) : safeTags tags
(catMaybes $ map sanitizeAttribute $ filter safeAttribute attributes) : safeTags tags
| otherwise = safeTags tags
safeTags (t:tags) = t:safeTags tags
@ -85,9 +86,10 @@ safeAttribute :: (Text, Text) -> Bool
safeAttribute (name, value) = name `member` sanitaryAttributes &&
(name `notMember` uri_attributes || sanitaryURI value)
sanitizeAttribute :: (Text, Text) -> (Text, Text)
sanitizeAttribute ("style", value) = ("style", sanitizeCSS value)
sanitizeAttribute attrs = attrs
sanitizeAttribute :: (Text, Text) -> Maybe (Text, Text)
sanitizeAttribute ("style", value) =
let css = sanitizeCSS value in if T.null css then Nothing else Just ("style", css)
sanitizeAttribute attr = Just attr
-- | Returns @True@ if the specified URI is not a potential security risk.
@ -174,7 +176,7 @@ acceptable_attributes = ["abbr", "accept", "accept-charset", "accesskey",
"replace", "required", "rev", "rightspacing", "rows", "rowspan",
"rules", "scope", "selected", "shape", "size", "span", "start",
"step",
-- "style", TODO: allow this with further filtering
"style", -- gets further filtering
"summary", "suppress", "tabindex", "target",
"template", "title", "toppadding", "type", "unselectable", "usemap",
"urn", "valign", "value", "variable", "volume", "vspace", "vrml",

View File

@ -1,5 +1,10 @@
{-# LANGUAGE OverloadedStrings #-}
module Text.HTML.SanitizeXSS.Css (sanitizeCSS) where
{-# LANGUAGE OverloadedStrings, CPP #-}
module Text.HTML.SanitizeXSS.Css (
sanitizeCSS
#ifdef TEST
, allowedCssAttributeValue
#endif
) where
import Data.Text (Text)
import qualified Data.Text as T
@ -8,23 +13,48 @@ import Data.Text.Lazy.Builder (toLazyText)
import Data.Text.Lazy (toStrict)
import Data.Set (member, fromList, Set)
import Data.Char (isDigit)
import Control.Applicative ((<|>))
import Control.Applicative ((<|>), pure)
import Text.CSS.Render (renderAttrs)
import Text.CSS.Parse (parseAttrs)
import Prelude hiding (takeWhile)
-- import FileLocation (debug, debugM)
-- this is a direct translation from sanitizer.py, except
-- sanitizer.py filters out url(), but this is redundant
sanitizeCSS :: Text -> Text
sanitizeCSS css = toStrict . toLazyText . renderAttrs . filter isSanitaryAttr $ parseAttributes
sanitizeCSS css = toStrict . toLazyText .
renderAttrs . filter isSanitaryAttr . filterUrl $ parseAttributes
where
filterUrl :: [(Text,Text)] -> [(Text,Text)]
filterUrl = map filterUrlAttribute
where
filterUrlAttribute :: (Text, Text) -> (Text, Text)
filterUrlAttribute (prop,value) =
case parseOnly rejectUrl value of
Left _ -> (prop,value)
Right noUrl -> filterUrlAttribute (prop, noUrl)
rejectUrl = do
pre <- manyTill anyChar (string "url")
skipMany space
_<-char '('
skipWhile (/= ')')
_<-char ')'
rest <- takeText
return $ T.append (T.pack pre) rest
parseAttributes = case parseAttrs css of
Left _ -> []
Right as -> as
isSanitaryAttr (_, "") = False
isSanitaryAttr ("",_) = False
isSanitaryAttr (prop, value)
| prop `member` allowed_css_properties = True
| (T.takeWhile (/= '-') value) `member` allowed_css_unit_properties &&
| (T.takeWhile (/= '-') prop) `member` allowed_css_unit_properties &&
all allowedCssAttributeValue (T.words value) = True
| prop `member` allowed_svg_properties = True
| otherwise = False
@ -32,40 +62,45 @@ sanitizeCSS css = toStrict . toLazyText . renderAttrs . filter isSanitaryAttr $
allowed_css_unit_properties :: Set Text
allowed_css_unit_properties = fromList ["background","border","margin","padding"]
allowedCssAttributeValue :: Text -> Bool
allowedCssAttributeValue val =
val `member` allowed_css_keywords ||
case parseOnly allowedCssAttributeParser val of
Left _ -> False
Right b -> b
where
allowedCssAttributeParser = do
hex <|> rgb <|> cssUnit
allowedCssAttributeValue :: Text -> Bool
allowedCssAttributeValue val =
val `member` allowed_css_keywords ||
case parseOnly allowedCssAttributeParser val of
Left _ -> False
Right b -> b
where
allowedCssAttributeParser = do
rgb <|> hex <|> rgb <|> cssUnit
aToF = fromList "abcdef"
aToF = fromList "abcdef"
hex = do
_ <- char '#'
hx <- takeText
return $ T.all (\c -> isDigit c || (c `member` aToF)) hx
hex = do
_ <- char '#'
hx <- takeText
return $ T.all (\c -> isDigit c || (c `member` aToF)) hx
rgb = do
_<- string "rgb("
skip isDigit >> try (skipWhile isDigit) >> try (skip (== '%'))
skip (== ',')
try (skipWhile isDigit) >> try (skip (== '%'))
skip (== ',')
try (skipWhile isDigit) >> try (skip (== '%'))
skip (== ',')
skip (== ')')
return True
-- should have used sepBy (symbol ",")
rgb = do
_<- string "rgb("
skipMany1 digit >> skipOk (== '%')
skip (== ',')
skipMany digit >> skipOk (== '%')
skip (== ',')
skipMany digit >> skipOk (== '%')
skip (== ')')
return True
cssUnit = do
try $ skip isDigit >> skip isDigit
try $ skip (== '.')
try $ skip isDigit >> skip isDigit
unit <- takeText
return $ unit `member` allowed_css_attribute_value_units
cssUnit = do
skip isDigit
skipOk isDigit
skipOk (== '.')
skipOk isDigit >> skipOk isDigit
skipSpace
unit <- takeText
return $ T.null unit || unit `member` allowed_css_attribute_value_units
skipOk :: (Char -> Bool) -> Parser ()
skipOk p = skip p <|> pure ()
allowed_css_attribute_value_units :: Set Text
allowed_css_attribute_value_units = fromList

View File

@ -21,7 +21,6 @@ library
, css-text >= 0.1 && < 0.2
, text >= 0.11 && < 0.12
, attoparsec-text >= 0.8.5.1 && < 0.9
, file-location
exposed-modules: Text.HTML.SanitizeXSS
@ -31,15 +30,16 @@ library
test-suite test
type: exitcode-stdio-1.0
main-is: test/main.hs
cpp-options: -DTEST
build-depends: base >= 4 && < 5, containers
, xss-sanitize >= 0.3
, tagsoup >= 0.11
, utf8-string >= 0.3
, network >= 2
, css-text >= 0.1 && < 0.2
, text >= 0.11 && < 0.12
, attoparsec-text >= 0.8.5.1 && < 0.9
, file-location
, hspec
, HUnit >= 1.2
source-repository head