From 66545752c6670faa8fc6c537340e2ed19d200b2f Mon Sep 17 00:00:00 2001 From: Greg Weber Date: Tue, 9 Aug 2011 09:24:16 -0700 Subject: [PATCH] fix up css parsing --- README.md | 2 +- Text/HTML/SanitizeXSS.hs | 12 ++-- Text/HTML/SanitizeXSS/Css.hs | 105 +++++++++++++++++++++++------------ xss-sanitize.cabal | 6 +- 4 files changed, 81 insertions(+), 44 deletions(-) diff --git a/README.md b/README.md index 99f20d0..ebb7e9a 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ If anyone knows of better sources or thinks a particular tag/attribute/value may style attribute ---------------- -style attributes are now *parsed* with the css-text and autoparsec-text dependencies. They are then ran through a white list for properties and keywords. Whitespace is not preserved. +style attributes are now *parsed* with the css-text and autoparsec-text dependencies. They are then ran through a white list for properties and keywords. Whitespace is not preserved. This code was again translated from sanitizer.py, but uses attopoarsec-text instead of regexes. data attributes ------------------------- diff --git a/Text/HTML/SanitizeXSS.hs b/Text/HTML/SanitizeXSS.hs index 72f000d..8b1cf65 100644 --- a/Text/HTML/SanitizeXSS.hs +++ b/Text/HTML/SanitizeXSS.hs @@ -21,6 +21,7 @@ import Network.URI ( parseURIReference, URI (..), import Codec.Binary.UTF8.String ( encodeString ) import qualified Data.Map as Map +import Data.Maybe (catMaybes) @@ -74,7 +75,7 @@ safeTags (t@(TagClose name):tags) | otherwise = safeTags tags safeTags (TagOpen name attributes:tags) | safeTagName name = TagOpen name - (map sanitizeAttribute $ filter safeAttribute attributes) : safeTags tags + (catMaybes $ map sanitizeAttribute $ filter safeAttribute attributes) : safeTags tags | otherwise = safeTags tags safeTags (t:tags) = t:safeTags tags @@ -85,9 +86,10 @@ safeAttribute :: (Text, Text) -> Bool safeAttribute (name, value) = name `member` sanitaryAttributes && (name `notMember` uri_attributes || sanitaryURI value) -sanitizeAttribute :: (Text, Text) -> (Text, Text) -sanitizeAttribute ("style", value) = ("style", sanitizeCSS value) -sanitizeAttribute attrs = attrs +sanitizeAttribute :: (Text, Text) -> Maybe (Text, Text) +sanitizeAttribute ("style", value) = + let css = sanitizeCSS value in if T.null css then Nothing else Just ("style", css) +sanitizeAttribute attr = Just attr -- | Returns @True@ if the specified URI is not a potential security risk. @@ -174,7 +176,7 @@ acceptable_attributes = ["abbr", "accept", "accept-charset", "accesskey", "replace", "required", "rev", "rightspacing", "rows", "rowspan", "rules", "scope", "selected", "shape", "size", "span", "start", "step", - -- "style", TODO: allow this with further filtering + "style", -- gets further filtering "summary", "suppress", "tabindex", "target", "template", "title", "toppadding", "type", "unselectable", "usemap", "urn", "valign", "value", "variable", "volume", "vspace", "vrml", diff --git a/Text/HTML/SanitizeXSS/Css.hs b/Text/HTML/SanitizeXSS/Css.hs index deb286f..108ff00 100644 --- a/Text/HTML/SanitizeXSS/Css.hs +++ b/Text/HTML/SanitizeXSS/Css.hs @@ -1,5 +1,10 @@ -{-# LANGUAGE OverloadedStrings #-} -module Text.HTML.SanitizeXSS.Css (sanitizeCSS) where +{-# LANGUAGE OverloadedStrings, CPP #-} +module Text.HTML.SanitizeXSS.Css ( + sanitizeCSS +#ifdef TEST +, allowedCssAttributeValue +#endif + ) where import Data.Text (Text) import qualified Data.Text as T @@ -8,23 +13,48 @@ import Data.Text.Lazy.Builder (toLazyText) import Data.Text.Lazy (toStrict) import Data.Set (member, fromList, Set) import Data.Char (isDigit) -import Control.Applicative ((<|>)) +import Control.Applicative ((<|>), pure) import Text.CSS.Render (renderAttrs) import Text.CSS.Parse (parseAttrs) +import Prelude hiding (takeWhile) + +-- import FileLocation (debug, debugM) -- this is a direct translation from sanitizer.py, except -- sanitizer.py filters out url(), but this is redundant sanitizeCSS :: Text -> Text -sanitizeCSS css = toStrict . toLazyText . renderAttrs . filter isSanitaryAttr $ parseAttributes +sanitizeCSS css = toStrict . toLazyText . + renderAttrs . filter isSanitaryAttr . filterUrl $ parseAttributes where + filterUrl :: [(Text,Text)] -> [(Text,Text)] + filterUrl = map filterUrlAttribute + where + filterUrlAttribute :: (Text, Text) -> (Text, Text) + filterUrlAttribute (prop,value) = + case parseOnly rejectUrl value of + Left _ -> (prop,value) + Right noUrl -> filterUrlAttribute (prop, noUrl) + + rejectUrl = do + pre <- manyTill anyChar (string "url") + skipMany space + _<-char '(' + skipWhile (/= ')') + _<-char ')' + rest <- takeText + return $ T.append (T.pack pre) rest + + parseAttributes = case parseAttrs css of Left _ -> [] Right as -> as + isSanitaryAttr (_, "") = False + isSanitaryAttr ("",_) = False isSanitaryAttr (prop, value) | prop `member` allowed_css_properties = True - | (T.takeWhile (/= '-') value) `member` allowed_css_unit_properties && + | (T.takeWhile (/= '-') prop) `member` allowed_css_unit_properties && all allowedCssAttributeValue (T.words value) = True | prop `member` allowed_svg_properties = True | otherwise = False @@ -32,40 +62,45 @@ sanitizeCSS css = toStrict . toLazyText . renderAttrs . filter isSanitaryAttr $ allowed_css_unit_properties :: Set Text allowed_css_unit_properties = fromList ["background","border","margin","padding"] - allowedCssAttributeValue :: Text -> Bool - allowedCssAttributeValue val = - val `member` allowed_css_keywords || - case parseOnly allowedCssAttributeParser val of - Left _ -> False - Right b -> b - where - allowedCssAttributeParser = do - hex <|> rgb <|> cssUnit +allowedCssAttributeValue :: Text -> Bool +allowedCssAttributeValue val = + val `member` allowed_css_keywords || + case parseOnly allowedCssAttributeParser val of + Left _ -> False + Right b -> b + where + allowedCssAttributeParser = do + rgb <|> hex <|> rgb <|> cssUnit - aToF = fromList "abcdef" + aToF = fromList "abcdef" - hex = do - _ <- char '#' - hx <- takeText - return $ T.all (\c -> isDigit c || (c `member` aToF)) hx + hex = do + _ <- char '#' + hx <- takeText + return $ T.all (\c -> isDigit c || (c `member` aToF)) hx - rgb = do - _<- string "rgb(" - skip isDigit >> try (skipWhile isDigit) >> try (skip (== '%')) - skip (== ',') - try (skipWhile isDigit) >> try (skip (== '%')) - skip (== ',') - try (skipWhile isDigit) >> try (skip (== '%')) - skip (== ',') - skip (== ')') - return True + -- should have used sepBy (symbol ",") + rgb = do + _<- string "rgb(" + skipMany1 digit >> skipOk (== '%') + skip (== ',') + skipMany digit >> skipOk (== '%') + skip (== ',') + skipMany digit >> skipOk (== '%') + skip (== ')') + return True - cssUnit = do - try $ skip isDigit >> skip isDigit - try $ skip (== '.') - try $ skip isDigit >> skip isDigit - unit <- takeText - return $ unit `member` allowed_css_attribute_value_units + cssUnit = do + skip isDigit + skipOk isDigit + skipOk (== '.') + skipOk isDigit >> skipOk isDigit + skipSpace + unit <- takeText + return $ T.null unit || unit `member` allowed_css_attribute_value_units + +skipOk :: (Char -> Bool) -> Parser () +skipOk p = skip p <|> pure () allowed_css_attribute_value_units :: Set Text allowed_css_attribute_value_units = fromList diff --git a/xss-sanitize.cabal b/xss-sanitize.cabal index 5318c07..0adddf7 100644 --- a/xss-sanitize.cabal +++ b/xss-sanitize.cabal @@ -21,7 +21,6 @@ library , css-text >= 0.1 && < 0.2 , text >= 0.11 && < 0.12 , attoparsec-text >= 0.8.5.1 && < 0.9 - , file-location exposed-modules: Text.HTML.SanitizeXSS @@ -31,15 +30,16 @@ library test-suite test type: exitcode-stdio-1.0 main-is: test/main.hs + cpp-options: -DTEST build-depends: base >= 4 && < 5, containers - , xss-sanitize >= 0.3 , tagsoup >= 0.11 , utf8-string >= 0.3 , network >= 2 , css-text >= 0.1 && < 0.2 , text >= 0.11 && < 0.12 , attoparsec-text >= 0.8.5.1 && < 0.9 - , file-location + , hspec + , HUnit >= 1.2 source-repository head