From 66545752c6670faa8fc6c537340e2ed19d200b2f Mon Sep 17 00:00:00 2001
From: Greg Weber <greg@gregweber.info>
Date: Tue, 9 Aug 2011 09:24:16 -0700
Subject: [PATCH] fix up css parsing

---
 README.md                    |   2 +-
 Text/HTML/SanitizeXSS.hs     |  12 ++--
 Text/HTML/SanitizeXSS/Css.hs | 105 +++++++++++++++++++++++------------
 xss-sanitize.cabal           |   6 +-
 4 files changed, 81 insertions(+), 44 deletions(-)

diff --git a/README.md b/README.md
index 99f20d0..ebb7e9a 100644
--- a/README.md
+++ b/README.md
@@ -59,7 +59,7 @@ If anyone knows of better sources or thinks a particular tag/attribute/value may
 
 style attribute
 ----------------
-style attributes are now *parsed* with the css-text and autoparsec-text dependencies. They are then ran through a white list for properties and keywords. Whitespace is not preserved.
+style attributes are now *parsed* with the css-text and autoparsec-text dependencies. They are then ran through a white list for properties and keywords. Whitespace is not preserved. This code was again translated from sanitizer.py, but uses attopoarsec-text instead of regexes.
 
 data attributes
 -------------------------
diff --git a/Text/HTML/SanitizeXSS.hs b/Text/HTML/SanitizeXSS.hs
index 72f000d..8b1cf65 100644
--- a/Text/HTML/SanitizeXSS.hs
+++ b/Text/HTML/SanitizeXSS.hs
@@ -21,6 +21,7 @@ import Network.URI ( parseURIReference, URI (..),
 import Codec.Binary.UTF8.String ( encodeString )
 
 import qualified Data.Map as Map
+import Data.Maybe (catMaybes)
 
 
 
@@ -74,7 +75,7 @@ safeTags (t@(TagClose name):tags)
     | otherwise = safeTags tags
 safeTags (TagOpen name attributes:tags)
   | safeTagName name = TagOpen name
-      (map sanitizeAttribute $ filter safeAttribute attributes) : safeTags tags
+      (catMaybes $ map sanitizeAttribute $ filter safeAttribute attributes) : safeTags tags
   | otherwise = safeTags tags
 safeTags (t:tags) = t:safeTags tags
 
@@ -85,9 +86,10 @@ safeAttribute :: (Text, Text) -> Bool
 safeAttribute (name, value) = name `member` sanitaryAttributes &&
   (name `notMember` uri_attributes || sanitaryURI value)
 
-sanitizeAttribute :: (Text, Text) -> (Text, Text)
-sanitizeAttribute ("style", value) = ("style", sanitizeCSS value)
-sanitizeAttribute attrs = attrs
+sanitizeAttribute :: (Text, Text) -> Maybe (Text, Text)
+sanitizeAttribute ("style", value) = 
+  let css = sanitizeCSS value in if T.null css then Nothing else Just ("style", css)
+sanitizeAttribute attr = Just attr
          
 
 -- | Returns @True@ if the specified URI is not a potential security risk.
@@ -174,7 +176,7 @@ acceptable_attributes = ["abbr", "accept", "accept-charset", "accesskey",
     "replace", "required", "rev", "rightspacing", "rows", "rowspan",
     "rules", "scope", "selected", "shape", "size", "span", "start",
     "step",
-    -- "style", TODO: allow this with further filtering
+    "style", -- gets further filtering
     "summary", "suppress", "tabindex", "target",
     "template", "title", "toppadding", "type", "unselectable", "usemap",
     "urn", "valign", "value", "variable", "volume", "vspace", "vrml",
diff --git a/Text/HTML/SanitizeXSS/Css.hs b/Text/HTML/SanitizeXSS/Css.hs
index deb286f..108ff00 100644
--- a/Text/HTML/SanitizeXSS/Css.hs
+++ b/Text/HTML/SanitizeXSS/Css.hs
@@ -1,5 +1,10 @@
-{-# LANGUAGE OverloadedStrings #-}
-module Text.HTML.SanitizeXSS.Css (sanitizeCSS) where
+{-# LANGUAGE OverloadedStrings, CPP #-}
+module Text.HTML.SanitizeXSS.Css (
+  sanitizeCSS
+#ifdef TEST
+, allowedCssAttributeValue
+#endif
+  ) where
 
 import Data.Text (Text)
 import qualified Data.Text as T
@@ -8,23 +13,48 @@ import Data.Text.Lazy.Builder (toLazyText)
 import Data.Text.Lazy (toStrict)
 import Data.Set (member, fromList, Set)
 import Data.Char (isDigit)
-import Control.Applicative ((<|>))
+import Control.Applicative ((<|>), pure)
 import Text.CSS.Render (renderAttrs)
 import Text.CSS.Parse (parseAttrs)
+import Prelude hiding (takeWhile)
+
+-- import FileLocation (debug, debugM)
 
 
 -- this is a direct translation from sanitizer.py, except
 --   sanitizer.py filters out url(), but this is redundant
 sanitizeCSS :: Text -> Text
-sanitizeCSS css = toStrict . toLazyText . renderAttrs . filter isSanitaryAttr $ parseAttributes
+sanitizeCSS css = toStrict . toLazyText .
+    renderAttrs . filter isSanitaryAttr . filterUrl $ parseAttributes
   where
+    filterUrl :: [(Text,Text)] -> [(Text,Text)]
+    filterUrl = map filterUrlAttribute
+      where
+        filterUrlAttribute :: (Text, Text) -> (Text, Text)
+        filterUrlAttribute (prop,value) =
+            case parseOnly rejectUrl value of
+              Left _ -> (prop,value)
+              Right noUrl -> filterUrlAttribute (prop, noUrl)
+
+        rejectUrl = do
+          pre <- manyTill anyChar (string "url")
+          skipMany space
+          _<-char '('
+          skipWhile (/= ')')
+          _<-char ')'
+          rest <- takeText
+          return $ T.append (T.pack pre) rest
+
+
     parseAttributes = case parseAttrs css of
       Left _ -> []
       Right as -> as
 
+    isSanitaryAttr (_, "") = False
+    isSanitaryAttr ("",_)  = False
     isSanitaryAttr (prop, value)
       | prop `member` allowed_css_properties = True
-      | (T.takeWhile (/= '-') value) `member` allowed_css_unit_properties &&
+      | (T.takeWhile (/= '-') prop) `member` allowed_css_unit_properties &&
           all allowedCssAttributeValue (T.words value) = True
       | prop `member` allowed_svg_properties = True
       | otherwise = False
@@ -32,40 +62,45 @@ sanitizeCSS css = toStrict . toLazyText . renderAttrs . filter isSanitaryAttr $
     allowed_css_unit_properties :: Set Text
     allowed_css_unit_properties = fromList ["background","border","margin","padding"]
 
-    allowedCssAttributeValue :: Text -> Bool
-    allowedCssAttributeValue val = 
-      val `member` allowed_css_keywords ||
-        case parseOnly allowedCssAttributeParser val of
-            Left _ -> False
-            Right b -> b
-      where
-        allowedCssAttributeParser = do
-          hex <|> rgb <|> cssUnit
+allowedCssAttributeValue :: Text -> Bool
+allowedCssAttributeValue val = 
+  val `member` allowed_css_keywords ||
+    case parseOnly allowedCssAttributeParser val of
+        Left _ -> False
+        Right b -> b
+  where
+    allowedCssAttributeParser = do
+      rgb <|> hex <|> rgb <|> cssUnit
 
-        aToF = fromList "abcdef"
+    aToF = fromList "abcdef"
 
-        hex = do
-          _ <- char '#'
-          hx <- takeText
-          return $ T.all (\c -> isDigit c || (c `member` aToF)) hx
+    hex = do
+      _ <- char '#'
+      hx <- takeText
+      return $ T.all (\c -> isDigit c || (c `member` aToF)) hx
 
-        rgb = do
-          _<- string "rgb("
-          skip isDigit >> try (skipWhile isDigit) >> try (skip (== '%'))
-          skip (== ',')
-          try (skipWhile isDigit) >> try (skip (== '%'))
-          skip (== ',')
-          try (skipWhile isDigit) >> try (skip (== '%'))
-          skip (== ',')
-          skip (== ')')
-          return True
+    -- should have used sepBy (symbol ",")
+    rgb = do
+      _<- string "rgb("
+      skipMany1 digit >> skipOk (== '%')
+      skip (== ',')
+      skipMany digit >> skipOk (== '%')
+      skip (== ',')
+      skipMany digit >> skipOk (== '%')
+      skip (== ')')
+      return True
 
-        cssUnit = do
-          try $ skip isDigit >> skip isDigit
-          try $ skip (== '.')
-          try $ skip isDigit >> skip isDigit
-          unit <- takeText
-          return $ unit `member` allowed_css_attribute_value_units
+    cssUnit = do
+      skip isDigit
+      skipOk isDigit
+      skipOk (== '.')
+      skipOk isDigit >> skipOk isDigit
+      skipSpace
+      unit <- takeText
+      return $ T.null unit || unit `member` allowed_css_attribute_value_units
+
+skipOk :: (Char -> Bool) -> Parser ()
+skipOk p = skip p <|> pure ()
 
 allowed_css_attribute_value_units :: Set Text
 allowed_css_attribute_value_units = fromList
diff --git a/xss-sanitize.cabal b/xss-sanitize.cabal
index 5318c07..0adddf7 100644
--- a/xss-sanitize.cabal
+++ b/xss-sanitize.cabal
@@ -21,7 +21,6 @@ library
                     , css-text >= 0.1 && < 0.2
                     , text                      >= 0.11     && < 0.12
                     , attoparsec-text           >= 0.8.5.1  && < 0.9
-                    , file-location
 
 
     exposed-modules: Text.HTML.SanitizeXSS
@@ -31,15 +30,16 @@ library
 test-suite test
     type: exitcode-stdio-1.0
     main-is: test/main.hs
+    cpp-options: -DTEST
     build-depends:   base  >= 4        && < 5, containers
-                   , xss-sanitize >= 0.3
                     , tagsoup >= 0.11
                     , utf8-string >= 0.3
                     , network >= 2
                     , css-text >= 0.1 && < 0.2
                     , text                      >= 0.11     && < 0.12
                     , attoparsec-text           >= 0.8.5.1  && < 0.9
-                    , file-location
+                    , hspec
+                    , HUnit >= 1.2
 
 
 source-repository head